# 使用CountVectorizer和TfidfTransformer计算文本的TF-IDF

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

if __name__ == "__main__":
    # Demo: compute TF-IDF weights for a small corpus with CountVectorizer +
    # TfidfTransformer and print them as a labelled table.
    txt = ["I live coding live",
           "he is a dog",
           "look he likes a dog",
           "it is raining cats and dogs"]
    vectorize = CountVectorizer()
    transformer = TfidfTransformer()

    # Turn the corpus into a document-term count matrix.
    tf = vectorize.fit_transform(txt)
    # Re-weight the raw counts into TF-IDF values.
    tf_idf = transformer.fit_transform(tf)

    print(type(tf), tf.shape, tf)
    print(tf_idf)

    # Collect every term the vectorizer recognised.
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorize, "get_feature_names_out"):
        # .tolist() keeps `word` a plain list of str, as the old API returned.
        word = vectorize.get_feature_names_out().tolist()
    else:
        word = vectorize.get_feature_names()
    print(word)

    # One row per document, one column per term. Labels are supplied directly
    # to the constructor instead of renaming the columns one at a time; the
    # resulting table is the same (column i is labelled word[i]).
    weight_df = pd.DataFrame(
        tf_idf.toarray(),
        index=['文本一', '文本二', '文本三', '文本四'],
        columns=word,
    )

    print(weight_df)
        
    

 

# 你可能感兴趣的:(使用CountVectorizer和TfidfTransformer计算文本的TF-IDF)