train.py
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os
import shutil
import jieba
import pickle
from sklearn.utils import Bunch  # sklearn.datasets.base.Bunch was removed in newer scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from Tools import savefile, readfile, readbunchobj, writebunchobj
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes classifier
from sklearn import metrics
import joblib  # sklearn.externals.joblib was removed from scikit-learn; use the standalone joblib package
# Chinese word segmentation
def corpus_segment(corpus_path, seg_path):  # corpus_path: unsegmented corpus; seg_path: where the segmented corpus is stored
    catelist = os.listdir(corpus_path)  # first-level directories of the corpus; each directory name is a category
    print("Segmenting...")
    if os.path.exists(seg_path):
        shutil.rmtree(seg_path)
    for mydir in catelist:  # iterate over the category directories; mydir is the category name
        class_path = corpus_path + mydir + "/"  # path of one category, e.g. train_corpus/art/
        seg_dir = seg_path + mydir + "/"  # corresponding output path, e.g. train_corpus_seg/art/
        if not os.path.exists(seg_dir):  # create the output directory if it does not exist
            os.makedirs(seg_dir)
        else:
            shutil.rmtree(seg_dir)
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # all text files of this category, i.e. xxx.txt
        for file_path in file_list:  # iterate over the files of this category
            fullname = class_path + file_path  # full file path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # raw bytes of the file, including extra spaces, blank lines, line breaks, etc.
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # remove line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # remove blank lines and extra spaces
            content_seg = jieba.cut(content)  # segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # save the segmented text to the output directory
    print("Chinese corpus segmentation finished!")
# Build a Bunch object from the segmented corpus
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # subdirectories of the segmented corpus, i.e. the category names
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])  # create a Bunch instance
    bunch.target_name.extend(catelist)  # extend() appends every item of catelist to the list
    for mydir in catelist:  # iterate over the category directories
        class_path = seg_path + mydir + "/"  # path of one category
        file_list = os.listdir(class_path)  # txt file names of this category
        for file_path in file_list:  # iterate over the txt files of this category
            fullname = class_path + file_path  # full file path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))  # read the file content
    with open(wordbag_path, "wb") as file_obj:  # persist the bunch to wordbag_path
        pickle.dump(bunch, file_obj)
    print("Text object built!")
# Build the TF-IDF vector space
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = readfile(stopword_path).decode('utf-8').splitlines()  # decode so the stop words are str and match the tokens
    bunch = readbunchobj(bunch_path)  # load the segmented bunch
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[],
                       vocabulary={})  # target bunch storing the TF-IDF matrix (tdm) and the vocabulary
    if train_tfidf_path is not None:
        trainbunch = readbunchobj(train_tfidf_path)  # when processing the test set, load the trained training-set space
        tfidfspace.vocabulary = trainbunch.vocabulary  # the test-set vocabulary is taken directly from the training set
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,  # build a vectorizer
                                     vocabulary=trainbunch.vocabulary)  # reuse the training-set vocabulary so no new terms are learned
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)  # compute TF-IDF and store it in the target bunch
    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True)  # build a vectorizer
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)  # compute TF-IDF and store it in the target bunch
        tfidfspace.vocabulary = vectorizer.vocabulary_  # keep the vocabulary fitted by the vectorizer
    writebunchobj(space_path, tfidfspace)  # persist the TF-IDF space to the given path
    print("TF-IDF vector space created!")
if __name__ == "__main__":
    train_corpus_path = "train_corpus/"  # unsegmented, categorized training corpus
    # test_corpus_path = "test_corpus/"  # unsegmented, categorized test corpus
    stopword_path = "stopword.txt"  # stop-word list
    train_seg_path = "train_corpus_seg/"  # segmented training corpus
    train_bunch_path = "train_word_bag/train_set.dat"  # where the training Bunch is stored
    train_space_path = "train_word_bag/tfdifspace.dat"
    if not os.path.exists('clf_model_chinese.m'):
        # segment the training corpus
        corpus_segment(train_corpus_path, train_seg_path)
        # build the Bunch object for the training set
        if not os.path.exists('train_word_bag/'):
            os.makedirs('train_word_bag/')
        corpus2Bunch(train_bunch_path, train_seg_path)
        # build the training-set TF-IDF vector space
        vector_space(stopword_path, train_bunch_path, train_space_path)
        '''# segment the test corpus
        test_seg_path = "test_corpus_seg/"  # segmented test corpus
        corpus_segment(test_corpus_path, test_seg_path)
        # build the Bunch object for the test set
        test_bunch_path = "test_word_bag/test_set.dat"  # where the test Bunch is stored
        if not os.path.exists('test_word_bag/'):
            os.makedirs('test_word_bag/')
        corpus2Bunch(test_bunch_path, test_seg_path)
        # build the test-set TF-IDF vector space
        test_space_path = "test_word_bag/tfdifspace.dat"
        vector_space(stopword_path, test_bunch_path, test_space_path, train_space_path)'''
        # load the training-set TF-IDF space
        train_set = readbunchobj(train_space_path)
        # train the classifier on the TF-IDF matrix and the labels; alpha is the additive
        # (Laplace/Lidstone) smoothing parameter, so smaller values mean less smoothing
        clf = MultinomialNB(alpha=0.0001).fit(train_set.tdm, train_set.label)
        joblib.dump(clf, "clf_model_chinese.m")
    else:
        print('Model already trained')
    '''
    # load the test-set TF-IDF space
    testpath = "test_word_bag/tfdifspace.dat"
    test_set = readbunchobj(testpath)
    # load the trained model
    clf = joblib.load('clf_model_chinese.m')
    # predict the categories
    predicted = clf.predict(test_set.tdm)
    # print the predictions
    for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
        print(file_name, " --> predicted category:", expct_cate)
    print("Prediction finished!")
    # evaluate the classifier
    def metrics_result(actual, predict):
        print('precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted')))
        print('recall:    {0:.3f}'.format(metrics.recall_score(actual, predict, average='weighted')))
        print('f1-score:  {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted')))
    metrics_result(test_set.label, predicted)'''
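
# Optional check (sketch): once training has run, the saved model can be reloaded and the
# learned categories inspected via the classifier's classes_ attribute, e.g.:
#   clf = joblib.load("clf_model_chinese.m")
#   print(clf.classes_)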
Tools.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pickle
# save content (bytes) to a file
def savefile(savepath, content):
    with open(savepath, "wb") as fp:
        fp.write(content)
# read a file as bytes
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content
# write a Bunch (or any picklable object) to a file
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)
# read a Bunch object back from a file
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch
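
# Usage sketch (hypothetical file names): the helpers pair up as byte-level file I/O and
# pickle-based object persistence, e.g.:
#   savefile("demo.txt", "some content".encode("utf-8"))
#   print(readfile("demo.txt").decode("utf-8"))
#   writebunchobj("demo.dat", {"label": [], "contents": []})
#   print(readbunchobj("demo.dat"))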
classification.py
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from Tools import readfile, readbunchobj
import joblib  # sklearn.externals.joblib was removed from scikit-learn; use the standalone joblib package
def Model(modelpath):
    clf = joblib.load(modelpath)
    return clf
def ClassificationCN(text, stoppath="stopword.txt", modelpath='clf_model_chinese.m',
                     vocabularypath="train_word_bag/tfdifspace.dat"):
    '''Load the model'''
    clf = Model(modelpath)
    '''Classify the text'''
    text = text.encode('utf-8')
    text = text.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # remove line breaks
    text = text.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # remove blank lines and extra spaces
    text_seg = jieba.cut(text)  # segment the text; the result is a generator
    text = " ".join(text_seg)  # join the tokens into one space-separated string
    text = [text]  # TfidfVectorizer expects an iterable of documents
    trainbunch = readbunchobj(vocabularypath)  # reuse the vocabulary learned on the training set
    stpwrdlst = readfile(stoppath).decode('utf-8').splitlines()  # decode so the stop words match the str tokens
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                 vocabulary=trainbunch.vocabulary)
    tfidf = vectorizer.fit_transform(text)  # compute the TF-IDF vector for the single document
    predicted = clf.predict(tfidf)
    predicted = predicted[0]
    return predicted
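
# Minimal usage sketch (the sample text is hypothetical; assumes train.py has already produced
# clf_model_chinese.m and train_word_bag/tfdifspace.dat in the working directory):
if __name__ == "__main__":
    sample_text = "这是一段用于测试文本分类接口的示例文本。"
    print("Predicted category:", ClassificationCN(sample_text))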