%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from time import time
from sklearn.datasets import load_files
print("loading train dataset ...")
t = time()
news_train = load_files('datasets/mlcomp/379/train')
news_train.data #list of the raw text of every document
news_train.target #array of integer labels, one per document
news_train.target_names #list of all category names
loading train dataset ...
['alt.atheism',
'comp.graphics',
'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware',
'comp.windows.x',
'misc.forsale',
'rec.autos',
'rec.motorcycles',
'rec.sport.baseball',
'rec.sport.hockey',
'sci.crypt',
'sci.electronics',
'sci.med',
'sci.space',
'soc.religion.christian',
'talk.politics.guns',
'talk.politics.mideast',
'talk.politics.misc',
'talk.religion.misc']
print("summary: {0} documents in {1} categories.".format(
len(news_train.data), len(news_train.target_names)))
print("done in {0} seconds".format(time() - t))
#the training set contains 13180 documents in 20 categories
summary: 13180 documents in 20 categories.
done in 3.2623984813690186 seconds
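As a quick sanity check (not part of the original run), a minimal sketch that peeks at the first training document and its label; note that load_files returns raw bytes, so the text may need decoding before display.
doc, label = news_train.data[0], news_train.target[0]
print(news_train.target_names[label])  # category name of the first document
print(doc[:200])  # load_files yields bytes; decode('latin-1') for clean text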
#convert every document into a vector of TF-IDF weights
from sklearn.feature_extraction.text import TfidfVectorizer
print("vectorizing train dataset ...")
t = time()
vectorizer = TfidfVectorizer(encoding='latin-1')
X_train = vectorizer.fit_transform(news_train.data)
#TfidfVectorizer turns the corpus into a matrix: one row per document, a sparse vector of TF-IDF weights for that document's words
X_train[0]
vectorizing train dataset ...
<1x130274 sparse matrix of type '<class 'numpy.float64'>'
with 108 stored elements in Compressed Sparse Row format>
print("n_samples: %d, n_features: %d" % X_train.shape)
print("number of non-zero features in sample [{0}]: {1}".format(
    news_train.filenames[0], X_train[0].getnnz()))  #filename of the first document, and the number of non-zero entries in row 0
print("done in {0} seconds".format(time() - t))
#The matrix has 13180 rows, one per document; each document is a sparse 130274-dimensional vector, with one dimension for every word that appears in the corpus
#The first document contains 108 distinct words, each carrying its TF-IDF weight
#X_train is therefore a 13180 x 130274 sparse matrix
n_samples: 13180, n_features: 130274
number of non-zero features in sample [datasets/mlcomp/379/train\talk.politics.misc\17860-178992]: 108
done in 3.9878034591674805 seconds
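To see what a TF-IDF row actually contains, a hedged sketch that lists the highest-weighted terms of the first document; it assumes scikit-learn >= 1.0 (get_feature_names_out), older versions use get_feature_names instead.
feature_names = vectorizer.get_feature_names_out()  # vocabulary, one entry per matrix column
row = X_train[0].toarray().ravel()                  # densify row 0 for inspection
for i in np.argsort(row)[::-1][:10]:                # indices of the ten largest TF-IDF weights
    print(feature_names[i], row[i])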
from sklearn.naive_bayes import MultinomialNB
print("traning models ...".format(time() - t))
t = time()
y_train = news_train.target
clf = MultinomialNB(alpha=0.0001)  #smaller alpha means less smoothing and a higher risk of overfitting
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
print("train score: {0}".format(train_score))
print("done in {0} seconds".format(time() - t))
training models ...
train score: 0.9978755690440061
done in 0.25583958625793457 seconds
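To make the overfitting remark about alpha concrete, a small sketch (not in the original run) that refits with a few smoothing values; the training score should climb toward 1 as alpha shrinks.
for a in [1.0, 0.01, 0.0001]:
    score = MultinomialNB(alpha=a).fit(X_train, y_train).score(X_train, y_train)
    print("alpha={0}: train score {1:.4f}".format(a, score))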
#grid search with cross-validation
from sklearn.model_selection import GridSearchCV
# alpha_list= np.linspace(0.01,0.001,100)
alpha_list=[0.01,0.001,0.0001]
# Set the parameters by cross-validation
param_grid = [{'alpha': alpha_list, }]
clf = GridSearchCV(MultinomialNB(), param_grid, cv=5)
clf.fit(X_train, y_train)
print("best param: {0}\nbest score: {1}".format(clf.best_params_,
clf.best_score_))
best param: {'alpha': 0.01}
best score: 0.9106980273141123
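Beyond the single best parameter, cv_results_ holds the cross-validated score of every candidate; a sketch to print them all:
res = clf.cv_results_
for a, mean, std in zip(res['param_alpha'], res['mean_test_score'], res['std_test_score']):
    print("alpha={0}: {1:.4f} (+/- {2:.4f})".format(a, mean, std))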
#load the test set
print("loading test dataset ...")
t = time()
news_test = load_files('datasets/mlcomp/379/test')
print("summary: {0} documents in {1} categories.".format(
len(news_test.data), len(news_test.target_names)))
print("done in {0} seconds".format(time() - t))
#the test set has 5648 documents in 20 categories
loading test dataset ...
summary: 5648 documents in 20 categories.
done in 49.46089696884155 seconds
#vectorize the test documents
print("vectorizing test dataset ...")
t = time()
X_test = vectorizer.transform(news_test.data)  #no fit() here: the vocabulary was already learned from the training corpus, transform() only maps documents to vectors
y_test = news_test.target
print("n_samples: %d, n_features: %d" % X_test.shape)
print("number of non-zero features in sample [{0}]: {1}".format(
news_test.filenames[0], X_test[0].getnnz()))
print("done in %fs" % (time() - t))
vectorizing test dataset ...
n_samples: 5648, n_features: 130274
number of non-zero features in sample [datasets/mlcomp/379/test\rec.autos\7429-103268]: 61
done in 1.470165s
#predict the category of the first test document
pred = clf.predict(X_test[0])
pred
print("predict: {0} is in category {1}".format(
news_test.filenames[0], news_test.target_names[pred[0]]))
print("actually: {0} is in category {1}".format(
news_test.filenames[0], news_test.target_names[news_test.target[0]]))
predict: datasets/mlcomp/379/test\rec.autos\7429-103268 is in category rec.autos
actually: datasets/mlcomp/379/test\rec.autos\7429-103268 is in category rec.autos
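MultinomialNB also exposes class probabilities, which show how confident this prediction is; a minimal sketch (GridSearchCV delegates predict_proba to the best estimator):
probs = clf.predict_proba(X_test[0])[0]  # probability for each of the 20 classes
for i in np.argsort(probs)[::-1][:3]:    # three most likely categories
    print(news_test.target_names[i], probs[i])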
print("predicting test dataset ...")
t = time()
pred = clf.predict(X_test)
print("done in %fs" % (time() - t))
predicting test dataset ...
done in 0.052190s
from sklearn.metrics import classification_report
print("classification report on test set for classifier:")
print(clf)
print(classification_report(y_test, pred,
target_names=news_test.target_names))
#recall measures how many of the true positive samples the model identifies; the higher the recall, the fewer positives are missed
#precision measures how many of the predicted positives are actually positive; the higher the precision, the fewer false alarms
#the F1-score is the harmonic mean of the two; a higher F1-score indicates a more balanced, robust classifier
classification report on test set for classifier:
MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)
precision recall f1-score support
alt.atheism 0.90 0.91 0.91 245
comp.graphics 0.80 0.90 0.85 298
comp.os.ms-windows.misc 0.82 0.79 0.80 292
comp.sys.ibm.pc.hardware 0.81 0.80 0.81 301
comp.sys.mac.hardware 0.90 0.91 0.91 256
comp.windows.x 0.88 0.88 0.88 297
misc.forsale 0.87 0.81 0.84 290
rec.autos 0.92 0.93 0.92 324
rec.motorcycles 0.96 0.96 0.96 294
rec.sport.baseball 0.97 0.94 0.96 315
rec.sport.hockey 0.96 0.99 0.98 302
sci.crypt 0.95 0.96 0.95 297
sci.electronics 0.91 0.85 0.88 313
sci.med 0.96 0.96 0.96 277
sci.space 0.94 0.97 0.96 305
soc.religion.christian 0.93 0.96 0.94 293
talk.politics.guns 0.91 0.96 0.93 246
talk.politics.mideast 0.96 0.98 0.97 296
talk.politics.misc 0.90 0.90 0.90 236
talk.religion.misc 0.89 0.78 0.83 171
avg / total 0.91 0.91 0.91 5648
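To ground those definitions, a sketch that recomputes the first row of the report with sklearn's per-class metric functions; precision is TP/(TP+FP), recall is TP/(TP+FN), and F1 is their harmonic mean.
from sklearn.metrics import precision_score, recall_score
prec = precision_score(y_test, pred, average=None)  # per-class TP / (TP + FP)
rec = recall_score(y_test, pred, average=None)      # per-class TP / (TP + FN)
f1 = 2 * prec * rec / (prec + rec)                  # harmonic mean of the two
print("{0}: precision={1:.2f} recall={2:.2f} f1={3:.2f}".format(
    news_test.target_names[0], prec[0], rec[0], f1[0]))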
#confusion matrix: shows which category each misclassified document was assigned to
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
print(news_test.target_names)
print("confusion matrix:")
print(cm)
#for example, 13 alt.atheism documents were misclassified as talk.religion.misc
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
confusion matrix:
[[224   0   0   0   0   0   0   0   0   0   0   0   0   0   2   5   0   0   1  13]
 [  1 267   5   5   2   8   1   1   0   0   0   2   3   2   1   0   0   0   0   0]
 [  1  13 230  24   4  10   5   0   0   0   0   1   2   1   0   0   0   0   1   0]
 [  0   9  21 242   7   2  10   1   0   0   1   1   7   0   0   0   0   0   0   0]
 [  0   1   5   5 233   2   2   2   1   0   0   3   1   0   1   0   0   0   0   0]
 [  0  20   6   3   1 260   0   0   0   2   0   1   0   0   2   0   2   0   0   0]
 [  0   2   5  12   3   1 235  10   2   3   1   0   7   0   2   0   2   1   4   0]
 [  0   1   0   0   1   0   8 300   4   1   0   0   1   2   3   0   2   0   1   0]
 [  0   1   0   0   0   2   2   3 283   0   0   0   1   0   0   0   0   0   1   1]
 [  0   1   1   0   1   2   1   2   0 297   8   1   0   1   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   2   2 298   0   0   0   0   0   0   0   0   0]
 [  0   1   2   0   0   1   1   0   0   0   0 284   2   1   0   0   2   1   2   0]
 [  0  11   3   5   4   2   4   5   1   1   0   4 266   1   4   0   1   0   1   0]
 [  1   1   0   1   0   2   1   0   0   0   0   0   1 266   2   1   0   0   1   0]
 [  0   3   0   0   1   1   0   0   0   0   0   1   0   1 296   0   1   0   1   0]
 [  3   1   0   1   0   0   0   0   0   0   1   0   0   2   1 280   0   1   1   2]
 [  1   0   2   0   0   0   0   0   1   0   0   0   0   0   0   0 236   1   4   1]
 [  1   0   0   0   0   1   0   0   0   0   0   0   0   0   0   3   0 290   1   0]
 [  2   1   0   0   1   1   0   1   0   0   0   0   0   0   0   1  10   7 212   0]
 [ 16   0   0   0   0   0   0   0   0   0   0   0   0   0   0  12   4   1   4 134]]
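Rather than scanning the matrix by eye, a sketch that zeroes the diagonal and lists the largest off-diagonal entries, i.e. the most common confusions:
errors = cm.copy()
np.fill_diagonal(errors, 0)                        # drop the correct predictions
for k in np.argsort(errors, axis=None)[::-1][:5]:  # five largest error counts
    i, j = np.unravel_index(k, errors.shape)
    print("{0} -> {1}: {2}".format(news_test.target_names[i],
                                   news_test.target_names[j], errors[i, j]))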
# Show confusion matrix
plt.figure(figsize=(8, 8), dpi=144)
plt.title('Confusion matrix of the classifier')
ax = plt.gca()
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_color('none')
ax.spines['left'].set_color('none')
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.matshow(cm, fignum=1, cmap='gray')
plt.colorbar();
Off the diagonal, the lighter a cell, the more documents were misclassified there.
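A hedged variant of the same plot with the category names restored on the axes, so the light off-diagonal cells can be read off directly:
fig, ax = plt.subplots(figsize=(8, 8), dpi=144)
im = ax.matshow(cm, cmap='gray')                    # same confusion matrix image
ticks = range(len(news_test.target_names))
ax.set_xticks(ticks); ax.set_yticks(ticks)
ax.set_xticklabels(news_test.target_names, rotation=90, fontsize=6)
ax.set_yticklabels(news_test.target_names, fontsize=6)
fig.colorbar(im);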