sklearn in Practice: Document Classification with Naive Bayes

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from time import time
from sklearn.datasets import load_files

print("loading train dataset ...")
t = time()
news_train = load_files('datasets/mlcomp/379/train')
news_train.data  # list: raw text of every document
news_train.target  # array: numeric category label of each document
news_train.target_names  # list: names of all categories
loading train dataset ...

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
print("summary: {0} documents in {1} categories.".format(
    len(news_train.data), len(news_train.target_names)))
print("done in {0} seconds".format(time() - t))
# The training set contains 13180 documents in 20 categories
summary: 13180 documents in 20 categories.
done in 3.2623984813690186 seconds
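Before vectorizing, it helps to peek at one raw sample. A minimal sketch (my addition, not part of the original notebook), assuming the dataset loaded above:

# load_files reads each file as raw bytes, so decode before printing
doc, label = news_train.data[0], news_train.target[0]
print("category:", news_train.target_names[label])
print(doc.decode('latin-1')[:200])  # first 200 characters of the text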
# Convert every document into a vector of TF-IDF weights
from sklearn.feature_extraction.text import TfidfVectorizer

print("vectorizing train dataset ...")
t = time()
vectorizer = TfidfVectorizer(encoding='latin-1')
X_train = vectorizer.fit_transform((d for d in news_train.data))
# TfidfVectorizer turns the corpus into a matrix: one row per document, a sparse vector of TF-IDF weights for the words it contains
X_train[0]
vectorizing train dataset ...

<1x130274 sparse matrix of type '<class 'numpy.float64'>'
    with 108 stored elements in Compressed Sparse Row format>
print("n_samples: %d, n_features: %d" % X_train.shape) 
print("number of non-zero features in sample [{0}]: {1}".format(
    news_train.filenames[0], X_train[0].getnnz()))  # filename of the first document and the number of non-zero elements in the first row
print("done in {0} seconds".format(time() - t))

# The matrix has 13180 rows, one per document; each document is a sparse
# vector in a 130274-dimensional space (the vocabulary of the whole corpus).
# The first document contains 108 distinct words, each with its TF-IDF weight.
# X_train is a 13180 x 130274 sparse matrix.
n_samples: 13180, n_features: 130274
number of non-zero features in sample [datasets/mlcomp/379/train\talk.politics.misc\17860-178992]: 108
done in 3.9878034591674805 seconds
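To see what those weights mean, here is an optional sketch (my addition) that lists the ten highest-weighted terms of the first document. Note the API name: scikit-learn >= 1.0 uses get_feature_names_out(); older releases expose get_feature_names() instead.

terms = vectorizer.get_feature_names_out()  # get_feature_names() on old versions
row = X_train[0].tocoo()  # COO format exposes (column, value) pairs directly
for idx, weight in sorted(zip(row.col, row.data), key=lambda t: -t[1])[:10]:
    print("{0:<20s} {1:.4f}".format(terms[idx], weight))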
from sklearn.naive_bayes import MultinomialNB

print("traning models ...".format(time() - t))
t = time()
y_train = news_train.target
clf = MultinomialNB(alpha=0.0001)  # the smaller alpha is, the more easily the model overfits
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
print("train score: {0}".format(train_score))
print("done in {0} seconds".format(time() - t))
training models ...
train score: 0.9978755690440061
done in 0.25583958625793457 seconds
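Why does a small alpha overfit? MultinomialNB estimates each word's per-class probability with additive (Laplace/Lidstone) smoothing:

\hat{\theta}_{ci} = \frac{N_{ci} + \alpha}{N_c + \alpha n}

where N_{ci} is the count of word i in class c, N_c is the total word count in class c, and n is the vocabulary size. As alpha approaches 0, the estimates track the raw training counts exactly, so rare words receive extreme probabilities and the model effectively memorizes the training set.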
# Grid search with cross-validation
from sklearn.model_selection import GridSearchCV

# alpha_list= np.linspace(0.01,0.001,100)
alpha_list=[0.01,0.001,0.0001]

# Set the parameters by cross-validation
param_grid = [{'alpha': alpha_list, }]

clf = GridSearchCV(MultinomialNB(), param_grid, cv=5)
clf.fit(X_train, y_train)
print("best param: {0}\nbest score: {1}".format(clf.best_params_, 
                                                clf.best_score_))
best param: {'alpha': 0.01}
best score: 0.9106980273141123
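The score for every candidate is stored in cv_results_; a quick sketch (my addition) to see how close the alphas are to each other:

for params, score in zip(clf.cv_results_['params'],
                         clf.cv_results_['mean_test_score']):
    print("alpha = {0}: mean CV accuracy = {1:.4f}".format(params['alpha'], score))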
# Load the test set
print("loading test dataset ...")
t = time()
news_test = load_files('datasets/mlcomp/379/test')
print("summary: {0} documents in {1} categories.".format(
    len(news_test.data), len(news_test.target_names)))
print("done in {0} seconds".format(time() - t))
# The test set contains 5648 documents in 20 categories
loading test dataset ...
summary: 5648 documents in 20 categories.
done in 49.46089696884155 seconds
# Vectorize the test documents
print("vectorizing test dataset ...")
t = time()
X_test = vectorizer.transform((d for d in news_test.data))  # no fit() needed: the vocabulary was already learned from the training corpus; transform() just maps documents to vectors
y_test = news_test.target
print("n_samples: %d, n_features: %d" % X_test.shape)
print("number of non-zero features in sample [{0}]: {1}".format(
    news_test.filenames[0], X_test[0].getnnz()))
print("done in %fs" % (time() - t))
vectorizing test dataset ...
n_samples: 5648, n_features: 130274
number of non-zero features in sample [datasets/mlcomp/379/test\rec.autos\7429-103268]: 61
done in 1.470165s
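Because transform() reuses the vocabulary learned during fit, the test documents land in the same 130274-dimensional feature space, and words never seen in training are simply dropped. A one-line check (my addition):

print(len(vectorizer.vocabulary_))  # 130274, matches n_features above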
# Predict the category of the first test document
pred = clf.predict(X_test[0])
pred
print("predict: {0} is in category {1}".format(
    news_test.filenames[0], news_test.target_names[pred[0]]))
print("actually: {0} is in category {1}".format(
    news_test.filenames[0], news_test.target_names[news_test.target[0]]))
predict: datasets/mlcomp/379/test\rec.autos\7429-103268 is in category rec.autos
actually: datasets/mlcomp/379/test\rec.autos\7429-103268 is in category rec.autos
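MultinomialNB also exposes per-class probabilities, which show how confident this prediction is. An optional sketch (my addition):

probs = clf.predict_proba(X_test[0])[0]  # probability for each of the 20 categories
for i in probs.argsort()[::-1][:3]:      # three most likely categories
    print("{0}: {1:.4f}".format(news_test.target_names[i], probs[i]))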
print("predicting test dataset ...")
t = time()
pred = clf.predict(X_test)
print("done in %fs" % (time() - t))
predicting test dataset ...
done in 0.052190s
from sklearn.metrics import classification_report

print("classification report on test set for classifier:")
print(clf)
print(classification_report(y_test, pred,
                            target_names=news_test.target_names))

# recall measures the model's ability to identify positive samples: the higher the recall, the fewer positives it misses.
# precision measures the model's ability to keep negatives out of its positive predictions: the higher the precision, the fewer false positives.
# The F1-score combines the two; a higher F1-score indicates a more robust classifier.
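In terms of true positives (TP), false positives (FP), and false negatives (FN):

\text{precision} = \frac{TP}{TP + FP}, \qquad \text{recall} = \frac{TP}{TP + FN}, \qquad F_1 = \frac{2 \cdot \text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}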
classification report on test set for classifier:
MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)
                          precision    recall  f1-score   support

             alt.atheism       0.90      0.91      0.91       245
           comp.graphics       0.80      0.90      0.85       298
 comp.os.ms-windows.misc       0.82      0.79      0.80       292
comp.sys.ibm.pc.hardware       0.81      0.80      0.81       301
   comp.sys.mac.hardware       0.90      0.91      0.91       256
          comp.windows.x       0.88      0.88      0.88       297
            misc.forsale       0.87      0.81      0.84       290
               rec.autos       0.92      0.93      0.92       324
         rec.motorcycles       0.96      0.96      0.96       294
      rec.sport.baseball       0.97      0.94      0.96       315
        rec.sport.hockey       0.96      0.99      0.98       302
               sci.crypt       0.95      0.96      0.95       297
         sci.electronics       0.91      0.85      0.88       313
                 sci.med       0.96      0.96      0.96       277
               sci.space       0.94      0.97      0.96       305
  soc.religion.christian       0.93      0.96      0.94       293
      talk.politics.guns       0.91      0.96      0.93       246
   talk.politics.mideast       0.96      0.98      0.97       296
      talk.politics.misc       0.90      0.90      0.90       236
      talk.religion.misc       0.89      0.78      0.83       171

             avg / total       0.91      0.91      0.91      5648
# Confusion matrix: shows which categories the misclassified documents were assigned to
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, pred)
print(news_test.target_names)
print("confusion matrix:")
print(cm)
# e.g. 13 documents of alt.atheism were misclassified as talk.religion.misc
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
confusion matrix:
[[224   0   0   0   0   0   0   0   0   0   0   0   0   0   2   5   0   0
    1  13]
 [  1 267   5   5   2   8   1   1   0   0   0   2   3   2   1   0   0   0
    0   0]
 [  1  13 230  24   4  10   5   0   0   0   0   1   2   1   0   0   0   0
    1   0]
 [  0   9  21 242   7   2  10   1   0   0   1   1   7   0   0   0   0   0
    0   0]
 [  0   1   5   5 233   2   2   2   1   0   0   3   1   0   1   0   0   0
    0   0]
 [  0  20   6   3   1 260   0   0   0   2   0   1   0   0   2   0   2   0
    0   0]
 [  0   2   5  12   3   1 235  10   2   3   1   0   7   0   2   0   2   1
    4   0]
 [  0   1   0   0   1   0   8 300   4   1   0   0   1   2   3   0   2   0
    1   0]
 [  0   1   0   0   0   2   2   3 283   0   0   0   1   0   0   0   0   0
    1   1]
 [  0   1   1   0   1   2   1   2   0 297   8   1   0   1   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   0   0   2   2 298   0   0   0   0   0   0   0
    0   0]
 [  0   1   2   0   0   1   1   0   0   0   0 284   2   1   0   0   2   1
    2   0]
 [  0  11   3   5   4   2   4   5   1   1   0   4 266   1   4   0   1   0
    1   0]
 [  1   1   0   1   0   2   1   0   0   0   0   0   1 266   2   1   0   0
    1   0]
 [  0   3   0   0   1   1   0   0   0   0   0   1   0   1 296   0   1   0
    1   0]
 [  3   1   0   1   0   0   0   0   0   0   1   0   0   2   1 280   0   1
    1   2]
 [  1   0   2   0   0   0   0   0   1   0   0   0   0   0   0   0 236   1
    4   1]
 [  1   0   0   0   0   1   0   0   0   0   0   0   0   0   0   3   0 290
    1   0]
 [  2   1   0   0   1   1   0   1   0   0   0   0   0   0   0   1  10   7
  212   0]
 [ 16   0   0   0   0   0   0   0   0   0   0   0   0   0   0  12   4   1
    4 134]]
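Instead of scanning the matrix by eye, a short sketch (my addition) can rank the largest off-diagonal cells, i.e. the most frequent confusions between pairs of categories:

cm_off = cm.copy()
np.fill_diagonal(cm_off, 0)  # keep only the misclassifications
flat = cm_off.ravel().argsort()[::-1][:5]
for r, c in zip(*np.unravel_index(flat, cm.shape)):
    print("{0} -> {1}: {2}".format(news_test.target_names[r],
                                   news_test.target_names[c], cm_off[r, c]))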
# Show confusion matrix
plt.figure(figsize=(8, 8), dpi=144)
plt.title('Confusion matrix of the classifier')
ax = plt.gca()                                  
ax.spines['right'].set_color('none')            
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_color('none')
ax.spines['left'].set_color('none')
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.matshow(cm, fignum=1, cmap='gray')
plt.colorbar();

[Figure 1: grayscale heat map of the classifier's confusion matrix]

Apart from the diagonal, the lighter a cell is, the more documents were misclassified there.
