This article draws on https://github.com/649453932/Chinese-Text-Classification-Pytorch?tab=readme-ov-file, https://github.com/leerumor/nlp_tutorial?tab=readme-ov-file, and https://zhuanlan.zhihu.com/p/73176084; it is a summary of, and hands-on attempt at, several classic NLP models.
200,000 news headlines were extracted from THUCNews, each 20 to 30 characters long, spread over 10 categories with 20,000 headlines per category.
The model takes characters as input and uses pre-trained embeddings: Sogou News Word+Character 300d.
Categories: finance, real estate, stocks, education, technology, society, politics, sports, games, entertainment.
There are two data-processing scripts, utils.py and utils_fasttext.py. Here we use utils_fasttext.py as the example, since it additionally incorporates n-gram features. The code follows, with comments added.
# coding: UTF-8
import os
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta

# maximum vocabulary size
MAX_VOCAB_SIZE = 10000
# define UNK and PAD as variables so the literals don't have to be retyped every time they are used as dict keys
UNK, PAD = '<UNK>', '<PAD>'
# build_vocab takes four arguments: file_path is the path to the text file, tokenizer is a tokenization function,
# max_size is the maximum vocabulary size, and min_freq is the minimum frequency for a token to be kept.
# It returns a vocabulary dict mapping each token to an index, used to convert text into sequences of ids.
def build_vocab(file_path, tokenizer, max_size, min_freq):
    # initialize the vocabulary dict
    vocab_dic = {}
    # read the file
    with open(file_path, 'r', encoding='UTF-8') as f:
        # iterate over the lines with tqdm to show a progress bar
        for line in tqdm(f):
            # strip leading/trailing whitespace
            lin = line.strip()
            if not lin:
                continue
            # each line is tab-separated; the first field is the text
            content = lin.split('\t')[0]
            # tokenize the text
            for word in tokenizer(content):
                # increment the count of each token, defaulting to 0 if unseen
                vocab_dic[word] = vocab_dic.get(word, 0) + 1
        # keep tokens with frequency >= min_freq, sort by frequency in descending order, and take the top max_size
        vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq], key=lambda x: x[1], reverse=True)[:max_size]
        # map each token in the sorted list to its index
        vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
        # add the special tokens: UNK for unknown tokens, PAD for padding
        vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic
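
# A quick illustration (the ids below are made up, not taken from the actual THUCNews vocabulary):
# running build_vocab on a file whose first column is character-level Chinese text would yield something like
#   {'0': 0, '大': 1, '国': 2, ..., '<UNK>': N, '<PAD>': N + 1}
# i.e. the most frequent characters get the smallest ids and the two special tokens are appended at the end.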
# build_dataset takes two arguments: config holds the configuration, ues_word decides whether to tokenize by word
def build_dataset(config, ues_word):
    # word-level tokenization splits on spaces; otherwise fall back to character-level
    if ues_word:
        tokenizer = lambda x: x.split(' ')  # word-level, tokens separated by spaces
    else:
        tokenizer = lambda x: [y for y in x]  # char-level
    # if a vocabulary already exists at vocab_path, load it; otherwise build one with build_vocab and pickle it
    if os.path.exists(config.vocab_path):
        vocab = pkl.load(open(config.vocab_path, 'rb'))
    else:
        vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(vocab, open(config.vocab_path, 'wb'))
    print(f"Vocab size: {len(vocab)}")

    # hash a bigram: takes the sequence, the position t and the number of buckets
    def biGramHash(sequence, t, buckets):
        t1 = sequence[t - 1] if t - 1 >= 0 else 0
        return (t1 * 14918087) % buckets

    # hash a trigram: same idea, but looks at the two preceding elements
    def triGramHash(sequence, t, buckets):
        t1 = sequence[t - 1] if t - 1 >= 0 else 0
        t2 = sequence[t - 2] if t - 2 >= 0 else 0
        return (t2 * 14918087 * 18408749 + t1 * 14918087) % buckets
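
    # A worked example (the ids are made up; buckets = 250499 follows the repo's FastText config):
    # for words_line = [7, 2, 5], biGramHash(words_line, 1, 250499) hashes the previous id 7,
    # giving (7 * 14918087) % 250499 = 219025; at t = 0 there is no previous token, so the hash is 0.
    # Each bucket index is later looked up in a separate n-gram embedding table, so hash collisions are tolerated.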
    # load_dataset loads and preprocesses one data split; path is the file path,
    # pad_size is the padded sequence length (default 32)
    def load_dataset(path, pad_size=32):
        # processed samples are collected here
        contents = []
        # read the file line by line
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                # each line is tab-separated into text and label
                content, label = lin.split('\t')
                # words_line stores the token ids of this line
                words_line = []
                # tokenize and record the original length
                token = tokenizer(content)
                seq_len = len(token)
                # pad with PAD if shorter than pad_size, truncate if longer
                if pad_size:
                    if len(token) < pad_size:
                        # list multiplication builds a list of (pad_size - len(token)) PAD tokens, appended with extend
                        token.extend([PAD] * (pad_size - len(token)))
                    else:
                        token = token[:pad_size]
                        seq_len = pad_size
                # word to id; tokens missing from the vocabulary fall back to the UNK index
                for word in token:
                    words_line.append(vocab.get(word, vocab.get(UNK)))

                # fastText n-grams: config.n_gram_vocab gives the number of hash buckets
                buckets = config.n_gram_vocab
                bigram = []
                trigram = []
                # ------ngram------
                # compute a bigram and a trigram hash for every position
                for i in range(pad_size):
                    bigram.append(biGramHash(words_line, i, buckets))
                    trigram.append(triGramHash(words_line, i, buckets))
                # -----------------
                # append the processed sample: token ids, label, sequence length, bigram and trigram hash lists
                contents.append((words_line, int(label), seq_len, bigram, trigram))
        return contents  # [([...], label, seq_len, [...], [...]), ...]

    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    # return the vocabulary and the processed train / dev / test splits
    return vocab, train, dev, test
# DatasetIterater iterates over a dataset, serving it in batches
class DatasetIterater(object):
    # takes three arguments: batches is the list of samples, batch_size is the batch size, device is the target device
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        # number of full batches the dataset can be split into
        self.n_batches = len(batches) // batch_size
        self.residue = False  # whether there is a leftover, not-full batch
        if len(batches) % batch_size != 0:
            self.residue = True
        self.index = 0
        self.device = device

    # _to_tensor converts a batch of samples into PyTorch tensors
    def _to_tensor(self, datas):
        # xx = [xxx[2] for xxx in datas]
        # indexx = np.argsort(xx)[::-1]
        # datas = np.array(datas)[indexx]
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        bigram = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        trigram = torch.LongTensor([_[4] for _ in datas]).to(self.device)
        # sequence lengths before padding (capped at pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        return (x, seq_len, bigram, trigram), y

    def __next__(self):
        # if a leftover batch exists and all full batches have been served, serve the final partial batch
        if self.residue and self.index == self.n_batches:
            # slice from the current position to the end of the data
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            # convert to tensors
            batches = self._to_tensor(batches)
            return batches
        # if every batch has been served, reset the index and raise StopIteration to end the iteration;
        # otherwise serve the next full batch
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    # return the iterator itself to satisfy the iterator protocol
    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):
    iter = DatasetIterater(dataset, config.batch_size, config.device)
    return iter
def get_time_dif(start_time):
    """Return the elapsed time since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


if __name__ == "__main__":
    '''Extract the pre-trained word vectors for the vocabulary.'''
    vocab_dir = "./THUCNews/data/vocab.pkl"
    pretrain_dir = "./THUCNews/data/sgns.sogou.char"
    emb_dim = 300
    filename_trimmed_dir = "./THUCNews/data/vocab.embedding.sougou"
    word_to_id = pkl.load(open(vocab_dir, 'rb'))
    # start from random vectors; entries found in the pre-trained file are overwritten below
    embeddings = np.random.rand(len(word_to_id), emb_dim)
    f = open(pretrain_dir, "r", encoding='UTF-8')
    for i, line in enumerate(f.readlines()):
        # if i == 0:  # skip the first line if it is a header
        #     continue
        lin = line.strip().split(" ")
        if lin[0] in word_to_id:
            idx = word_to_id[lin[0]]
            emb = [float(x) for x in lin[1:301]]
            embeddings[idx] = np.asarray(emb, dtype='float32')
    f.close()
    np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)
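For reference, the .npz file saved above is what the model configs later load to initialize the embedding layer. A minimal sketch of that usage is below; the file name (np.savez_compressed appends ".npz" to the path above) and the freeze=False choice are assumptions here, not a copy of the repo's config code.
import numpy as np
import torch
import torch.nn as nn

# load the trimmed pre-trained vectors saved by the script above
pretrained = torch.tensor(
    np.load("./THUCNews/data/vocab.embedding.sougou.npz")["embeddings"].astype("float32"))
# build an embedding layer initialized from them; freeze=False allows fine-tuning during training
embedding = nn.Embedding.from_pretrained(pretrained, freeze=False)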
The model weight initialization, training, evaluation, and testing code is kept in train_eval.py:
# coding: UTF-8
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from utils import get_time_dif
from tensorboardX import SummaryWriter


# Weight initialization; the default is Xavier, which sets the variance of the
# initial weights automatically based on the number of units in each layer
def init_network(model, method='xavier', exclude='embedding', seed=123):
    # named_parameters returns an iterator over (parameter name, parameter) pairs
    for name, w in model.named_parameters():
        # only initialize parameters whose name does not contain the excluded substring (default: 'embedding')
        if exclude not in name:
            # names containing 'weight' are treated as weight matrices and initialized with the chosen method
            if 'weight' in name:
                if method == 'xavier':
                    # Xavier normal initialization
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    # Kaiming normal initialization
                    nn.init.kaiming_normal_(w)
                else:
                    # standard normal initialization
                    nn.init.normal_(w)
            elif 'bias' in name:
                # biases are initialized to the constant 0
                nn.init.constant_(w, 0)
            else:
                # all other parameters are left untouched
                pass
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    # exponential learning-rate decay, applied once per epoch: lr = gamma * lr
    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index at which the dev loss last improved
    flag = False  # whether training has gone a long time without improvement
    writer = SummaryWriter(log_dir=config.log_path + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        # scheduler.step()  # learning-rate decay
        # the iterator's _to_tensor returns batches in the form (x, seq_len, bigram, trigram), y
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # every 100 batches, report performance on the training batch and the dev set
                true = labels.data.cpu()
                # outputs.data holds the class scores, one row per sample;
                # torch.max(outputs.data, 1) returns (max values, indices) along dim 1, and [1] selects the indices
                predic = torch.max(outputs.data, 1)[1].cpu()
                # training-batch accuracy
                train_acc = metrics.accuracy_score(true, predic)
                # evaluate returns the accuracy and loss on the dev set
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                # if the dev loss improved, record it and save the model's state dict
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                # log the training / validation loss and accuracy to TensorBoard
                writer.add_scalar("loss/train", loss.item(), total_batch)
                writer.add_scalar("loss/dev", dev_loss, total_batch)
                writer.add_scalar("acc/train", train_acc, total_batch)
                writer.add_scalar("acc/dev", dev_acc, total_batch)
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # the dev loss has not improved for more than config.require_improvement batches (1000 here); stop early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    writer.close()
    test(config, model, test_iter)
def test(config, model, test_iter):
    # test: load the best checkpoint saved during training and evaluate it on the test set
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    # no gradients are needed during evaluation
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        # for the test set, also return a per-class report and the confusion matrix
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
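To see how the two files fit together, the sketch below mirrors what the repo's run.py roughly does for the FastText model. The module name models.FastText, the Config('THUCNews', 'embedding_SougouNews.npz') arguments, and setting config.n_vocab afterwards follow the repo's conventions but are written from memory here, so treat them as assumptions rather than an exact copy of run.py.
# a minimal driver sketch, under the assumptions stated above
from importlib import import_module

from utils_fasttext import build_dataset, build_iterator
from train_eval import init_network, train

model_module = import_module('models.FastText')           # the FastText model definition from the repo
config = model_module.Config('THUCNews', 'embedding_SougouNews.npz')

# char-level input; returns the vocabulary and the three processed splits
vocab, train_data, dev_data, test_data = build_dataset(config, ues_word=False)
train_iter = build_iterator(train_data, config)
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)

config.n_vocab = len(vocab)                                # the embedding layer needs the final vocab size
model = model_module.Model(config).to(config.device)
init_network(model)                                        # Xavier init for everything except the embedding
train(config, model, train_iter, dev_iter, test_iter)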