using
System;
using
System.Collections.Generic;
using
System.Text;
namespace Cluster
{
    /// <summary>
    /// Computes TF-IDF (term frequency–inverse document frequency) weights for a
    /// collection of documents that have already been split into terms.
    /// </summary>
    internal static class TFIDF
    {
        /// <summary>
        /// Calculates the TF-IDF weight of every distinct term in every document.
        /// </summary>
        /// <param name="docs">
        /// Documents to process; each document is an array of already-segmented terms.
        /// </param>
        /// <returns>
        /// One dictionary per document, in the same order as <paramref name="docs"/>.
        /// Each dictionary maps a term's vocabulary index to that term's TF-IDF weight
        /// in the document. Indices are assigned in first-seen order starting at 0.
        /// The term-to-index vocabulary is local to this call and is not returned.
        /// An empty document yields an empty dictionary.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="docs"/> is null, or a document contains a null term
        /// (rejected by the vocabulary dictionary lookup).
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="docs"/> contains a null document.
        /// </exception>
        public static List<Dictionary<int, double>> Calculate(string[][] docs)
        {
            if (docs == null)
            {
                throw new ArgumentNullException("docs");
            }

            // Vocabulary: term text -> Term (vocabulary index + document count).
            Dictionary<string, Term> terms = new Dictionary<string, Term>();
            // Per-document term frequencies, keyed by vocabulary index.
            List<Dictionary<int, double>> tfs = new List<Dictionary<int, double>>(docs.Length);
            // Inverse document frequency per vocabulary index.
            Dictionary<int, double> idfs = new Dictionary<int, double>();
            // Final per-document TF-IDF weights.
            List<Dictionary<int, double>> tfidfs = new List<Dictionary<int, double>>(docs.Length);

            // Order matters: CalcTF builds the vocabulary and document counts
            // that CalcIDF reads, and CalcTFIDF combines the two results.
            CalcTF(docs, terms, tfs);
            CalcIDF(docs, terms, idfs);
            CalcTFIDF(tfs, idfs, tfidfs);

            return tfidfs;
        }

        #region TF

        /// <summary>
        /// Computes the term frequency of every document and, as a side effect,
        /// builds the vocabulary and counts in how many documents each term occurs.
        /// </summary>
        /// <param name="docs">Documents, each an array of terms.</param>
        /// <param name="terms">
        /// Vocabulary to fill. Unseen terms are added with the next free index
        /// (the current dictionary size); each term's <c>docNum</c> is incremented
        /// once per document that contains it.
        /// </param>
        /// <param name="tfs">
        /// Receives one dictionary per document mapping vocabulary index to
        /// (occurrences of the term in the document) / (total terms in the document).
        /// </param>
        private static void CalcTF(string[][] docs, Dictionary<string, Term> terms, List<Dictionary<int, double>> tfs)
        {
            foreach (string[] doc in docs)
            {
                if (doc == null)
                {
                    throw new ArgumentException("The document collection must not contain null documents.", "docs");
                }

                // Occurrence count of each vocabulary index within this document.
                Dictionary<int, int> termCounts = new Dictionary<int, int>();

                foreach (string term in doc)
                {
                    // Resolve (or allocate) the vocabulary entry with a single lookup.
                    int index;
                    Term entry;
                    if (terms.TryGetValue(term, out entry))
                    {
                        index = entry.index;
                    }
                    else
                    {
                        index = terms.Count;
                        entry = new Term(index);
                        terms.Add(term, entry);
                    }

                    int seen;
                    if (termCounts.TryGetValue(index, out seen))
                    {
                        termCounts[index] = seen + 1;
                    }
                    else
                    {
                        termCounts.Add(index, 1);
                        // First occurrence in this document: one more document contains the term.
                        entry.docNum++;
                    }
                }

                // An empty document has no entries in termCounts, so the division
                // below is never executed with a zero length.
                double len = (double)doc.Length;
                Dictionary<int, double> tf = new Dictionary<int, double>(termCounts.Count);
                foreach (KeyValuePair<int, int> kvp in termCounts)
                {
                    // occurrences of this term / total number of terms in the document
                    tf.Add(kvp.Key, (double)kvp.Value / len);
                }

                tfs.Add(tf);
            }
        }

        #endregion

        #region IDF

        /// <summary>
        /// Computes the inverse document frequency of every vocabulary term as
        /// ln(total number of documents / number of documents containing the term).
        /// </summary>
        /// <param name="docs">Documents; only the number of documents is used.</param>
        /// <param name="terms">Vocabulary with per-term document counts, as filled by <c>CalcTF</c>.</param>
        /// <param name="idfs">Receives the IDF value for each vocabulary index.</param>
        private static void CalcIDF(string[][] docs, Dictionary<string, Term> terms, Dictionary<int, double> idfs)
        {
            double len = (double)docs.Length;

            foreach (Term term in terms.Values)
            {
                // Natural logarithm; a term present in every document gets ln(1) = 0.
                double idf = Math.Log(len / (double)term.docNum);
                idfs.Add(term.index, idf);
            }
        }

        #endregion

        #region TF-IDF

        /// <summary>
        /// Multiplies each document's term frequencies by the matching inverse
        /// document frequency to obtain the TF-IDF weights.
        /// </summary>
        /// <param name="tfs">Per-document term frequencies keyed by vocabulary index.</param>
        /// <param name="idfs">
        /// Inverse document frequency keyed by vocabulary index; must contain an
        /// entry for every index that appears in <paramref name="tfs"/>.
        /// </param>
        /// <param name="tfidfs">
        /// Receives one dictionary per entry of <paramref name="tfs"/>, in the same
        /// order, mapping vocabulary index to tf * idf.
        /// </param>
        private static void CalcTFIDF(List<Dictionary<int, double>> tfs, Dictionary<int, double> idfs, List<Dictionary<int, double>> tfidfs)
        {
            foreach (Dictionary<int, double> tf in tfs)
            {
                Dictionary<int, double> tfidf = new Dictionary<int, double>(tf.Count);
                foreach (KeyValuePair<int, double> kvp in tf)
                {
                    tfidf.Add(kvp.Key, kvp.Value * idfs[kvp.Key]);
                }

                tfidfs.Add(tfidf);
            }
        }

        #endregion
    }
}