TF-IDF(C#)

翻旧代码时看到以前写的 TF-IDF 的 C# 实现,在这里分享一下。

 

PS:codeproject.com 上有一位泰国开发者实现的版本,不过代码写得非常乱。

 

代码
using  System;
using  System.Collections.Generic;
using  System.Text;

namespace Cluster
{
    /// <summary>
    /// A single entry (term) of the vocabulary.
    /// </summary>
    class Term
    {
        /// <summary>
        /// Number of documents in which this term has appeared.
        /// </summary>
        public int docNum;

        /// <summary>
        /// Index of the term in the vocabulary (its ordinal in the linear term list).
        /// </summary>
        public int index;

        /// <summary>
        /// Creates a term with the given vocabulary index and a document count of zero.
        /// </summary>
        /// <param name="index">Vocabulary index to store in <see cref="index"/>.</param>
        public Term(int index)
        {
            this.docNum = 0; // same as the field default; spelled out for readers
            this.index = index;
        }
    }
}

 

代码
using  System;
using  System.Collections.Generic;
using  System.Text;

namespace Cluster
{
    /// <summary>
    /// Term frequency-inverse document frequency (TF-IDF) weighting for documents
    /// that have already been split into tokens.
    /// </summary>
    static class TFIDF
    {
        /// <summary>
        /// Computes the TF-IDF weight of every term in every document.
        /// </summary>
        /// <param name="docs">Documents to process; each document is an array of tokens (already segmented).</param>
        /// <returns>
        /// One dictionary per document, in input order. Each dictionary maps a term's
        /// vocabulary index (assigned in order of first appearance across all documents)
        /// to <c>tf * idf</c> for that term in that document.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="docs"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="docs"/> contains a null document.</exception>
        public static List<Dictionary<int, double>> Calculate(string[][] docs)
        {
            if (docs == null)
            {
                throw new ArgumentNullException("docs");
            }
            // Validate up front so a bad document fails with a clear message
            // instead of a NullReferenceException halfway through the computation.
            foreach (string[] doc in docs)
            {
                if (doc == null)
                {
                    throw new ArgumentException("Documents must not be null.", "docs");
                }
            }

            List<Dictionary<int, double>> tfidfs = new List<Dictionary<int, double>>();

            Dictionary<string, Term> terms = new Dictionary<string, Term>();         // vocabulary
            List<Dictionary<int, double>> tfs = new List<Dictionary<int, double>>(); // term frequencies, one per document
            Dictionary<int, double> idfs = new Dictionary<int, double>();            // inverse document frequencies

            CalcTF(docs, terms, tfs);
            CalcIDF(docs, terms, idfs);
            CalcTFIDF(tfs, idfs, tfidfs);

            return tfidfs;
        }

        #region TF
        /// <summary>
        /// Computes the term frequency of each document and builds the vocabulary as a side effect.
        /// </summary>
        /// <param name="docs">Tokenized documents.</param>
        /// <param name="terms">Vocabulary to populate: token text to its <see cref="Term"/> (index and document count).</param>
        /// <param name="tfs">Receives one dictionary per document mapping term index to (occurrences / document length).</param>
        private static void CalcTF(string[][] docs, Dictionary<string, Term> terms, List<Dictionary<int, double>> tfs)
        {
            foreach (string[] doc in docs)
            {
                // Occurrence count of each term index within the current document.
                Dictionary<int, int> termNums = new Dictionary<int, int>();

                foreach (string term in doc)
                {
                    // Look the token up once; register it with the next free index if it is new.
                    Term entry;
                    if (!terms.TryGetValue(term, out entry))
                    {
                        entry = new Term(terms.Count);
                        terms.Add(term, entry);
                    }

                    int count;
                    if (termNums.TryGetValue(entry.index, out count))
                    {
                        termNums[entry.index] = count + 1;
                    }
                    else
                    {
                        // First occurrence in this document: it counts toward the term's document frequency.
                        termNums.Add(entry.index, 1);
                        entry.docNum++;
                    }
                }

                // An empty document yields an empty dictionary; the division below never runs for it.
                double len = doc.Length;
                Dictionary<int, double> tf = new Dictionary<int, double>();
                foreach (KeyValuePair<int, int> kvp in termNums)
                {
                    tf.Add(kvp.Key, kvp.Value / len); // occurrences of the term / total tokens in the document
                }
                tfs.Add(tf);
            }
        }
        #endregion

        #region IDF
        /// <summary>
        /// Computes the inverse document frequency of every term in the vocabulary.
        /// </summary>
        /// <param name="docs">Tokenized documents; only the document count is used.</param>
        /// <param name="terms">Vocabulary with per-term document counts, as filled by <c>CalcTF</c>.</param>
        /// <param name="idfs">Receives term index to ln(total documents / documents containing the term).</param>
        private static void CalcIDF(string[][] docs, Dictionary<string, Term> terms, Dictionary<int, double> idfs)
        {
            double len = docs.Length;

            foreach (KeyValuePair<string, Term> kvp in terms)
            {
                // docNum is at least 1 for any term registered by CalcTF, so the ratio is finite.
                double idf = Math.Log(len / kvp.Value.docNum); // natural logarithm
                idfs.Add(kvp.Value.index, idf);
            }
        }
        #endregion

        #region TF-IDF
        /// <summary>
        /// Multiplies each term frequency by the matching inverse document frequency.
        /// </summary>
        /// <param name="tfs">Per-document term frequencies keyed by term index.</param>
        /// <param name="idfs">Inverse document frequency keyed by term index.</param>
        /// <param name="tfidfs">Receives one dictionary per document mapping term index to tf * idf.</param>
        private static void CalcTFIDF(List<Dictionary<int, double>> tfs, Dictionary<int, double> idfs, List<Dictionary<int, double>> tfidfs)
        {
            foreach (Dictionary<int, double> tf in tfs)
            {
                Dictionary<int, double> tfidf = new Dictionary<int, double>();
                foreach (KeyValuePair<int, double> kvp in tf)
                {
                    tfidf.Add(kvp.Key, kvp.Value * idfs[kvp.Key]);
                }
                tfidfs.Add(tfidf);
            }
        }
        #endregion
    }
}

 

你可能感兴趣的:(C#)