Hands-On NLP: Text Classification (TextRNN, TextCNN, TextRNN_Att, TextRCNN, FastText, DPCNN, BERT, ERNIE)

This article draws on https://github.com/649453932/Chinese-Text-Classification-Pytorch?tab=readme-ov-file, https://github.com/leerumor/nlp_tutorial?tab=readme-ov-file, and https://zhuanlan.zhihu.com/p/73176084; it is a summary of, and hands-on experiment with, several classic NLP models.

Chinese Dataset

200,000 news titles were sampled from THUCNews, each between 20 and 30 characters long. There are 10 classes, with 20,000 titles per class.

The models take character-level input and use pretrained embeddings: Sogou News Word+Character 300d.

Classes: finance, realty, stocks, education, science, society, politics, sports, games, entertainment.
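Each line of the train/dev/test files pairs a news title with a numeric label, separated by a tab. An illustrative (made-up) line, where 2 is the index of the stocks class in the list above and \t denotes the tab character:

上证指数早盘涨逾1%\t2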

Data Processing

There are two data-processing scripts, utils.py and utils_fasttext.py. This section walks through utils_fasttext.py, which additionally folds in n-gram features. The code follows, with comments added.

# coding: UTF-8
import os
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta

# Maximum vocabulary size
MAX_VOCAB_SIZE = 10000
# Define UNK and PAD as constants so dictionary operations don't need quoted literals
UNK, PAD = '<UNK>', '<PAD>'  # unknown-word token and padding token


# build_vocab takes four arguments: file_path is the path to the text file, tokenizer is a
# tokenization function, max_size is the maximum vocabulary size, and min_freq is the minimum
# frequency a word or character must reach to be kept.
# It returns a vocabulary dict mapping token -> index, used to convert text into numeric sequences.
def build_vocab(file_path, tokenizer, max_size, min_freq):
    # Initialize the vocabulary dictionary
    vocab_dic = {}
    # Read the file
    with open(file_path, 'r', encoding='UTF-8') as f:
        # Iterate over the lines with tqdm to show a progress bar
        for line in tqdm(f):
            # Strip leading and trailing whitespace
            lin = line.strip()
            if not lin:
                continue
            # Each line is tab-separated; take the first field (the text)
            content = lin.split('\t')[0]
            # Tokenize the text
            for word in tokenizer(content):
                # Increment the token's count, defaulting to 0 if unseen
                vocab_dic[word] = vocab_dic.get(word, 0) + 1

        # Keep tokens whose frequency is at least min_freq, sort by frequency in
        # descending order, and truncate to the top max_size tokens
        vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq], key=lambda x: x[1], reverse=True)[:max_size]
        # Rebuild the dict with each token of vocab_list as key and its rank as index
        vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
        # Append the special tokens: UNK for unknown words, PAD for padding
        vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic

# build_dataset takes two arguments: config carries the configuration, use_word selects word-level tokenization
def build_dataset(config, use_word):
    # Word-level tokenization splits on spaces; otherwise fall back to characters
    if use_word:
        tokenizer = lambda x: x.split(' ')  # split on spaces, word-level
    else:
        tokenizer = lambda x: [y for y in x]  # char-level
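    # For example (illustrative input): char-level tokenization maps '今天天气' to
    # ['今', '天', '天', '气'], so every character gets its own vocabulary id,
    # while word-level expects the text to be pre-segmented with spaces.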
    # If a vocabulary file exists at vocab_path, load it; otherwise build one with build_vocab and pickle it
    if os.path.exists(config.vocab_path):
        vocab = pkl.load(open(config.vocab_path, 'rb'))
    else:
        vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(vocab, open(config.vocab_path, 'wb'))
    print(f"Vocab size: {len(vocab)}")

    # Hash a bigram: takes the sequence, the position t, and the number of hash buckets
    def biGramHash(sequence, t, buckets):
        t1 = sequence[t - 1] if t - 1 >= 0 else 0
        return (t1 * 14918087) % buckets

    # Hash a trigram: also takes the second preceding element into account
    def triGramHash(sequence, t, buckets):
        t1 = sequence[t - 1] if t - 1 >= 0 else 0
        t2 = sequence[t - 2] if t - 2 >= 0 else 0
        return (t2 * 14918087 * 18408749 + t1 * 14918087) % buckets
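    # Worked example (illustrative values; buckets = 250499 is the FastText config's
    # n_gram_vocab in the referenced repo): for words_line = [5, 9, 2],
    # biGramHash(words_line, 1, 250499) = (5 * 14918087) % 250499 = 192232;
    # at t = 0 there is no previous id, so t1 falls back to 0 and the hash is 0.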

    # load_dataset loads and processes one data file; path is the file path, pad_size is the optional padding length (default 32)
    def load_dataset(path, pad_size=32):
        # Accumulate the processed examples in this list
        contents = []
        # Read the file line by line
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                # Each line is tab-separated into text and label
                content, label = lin.split('\t')
                # words_line stores the token indices of this line
                words_line = []
                # Tokenize and record the sequence length
                token = tokenizer(content)
                seq_len = len(token)
                # Pad short sequences with PAD; truncate long ones to pad_size
                if pad_size:
                    if len(token) < pad_size:
                        # List multiplication builds a list of pad_size - len(token) PAD tokens, appended with extend
                        token.extend([PAD] * (pad_size - len(token)))
                    else:
                        token = token[:pad_size]
                        seq_len = pad_size
                # Map tokens to ids, falling back to the UNK index for out-of-vocabulary tokens
                for word in token:
                    words_line.append(vocab.get(word, vocab.get(UNK)))

                # fastText n-grams: use config.n_gram_vocab as the number of hash buckets
                buckets = config.n_gram_vocab
                bigram = []
                trigram = []
                # ------ngram------
                # Generate a bigram and a trigram hash for every position
                for i in range(pad_size):
                    bigram.append(biGramHash(words_line, i, buckets))
                    trigram.append(triGramHash(words_line, i, buckets))
                # -----------------
                # Append the processed example as a tuple: token ids, label, sequence length,
                # bigram hash list, and trigram hash list
                contents.append((words_line, int(label), seq_len, bigram, trigram))
        return contents  # list of (words_line, label, seq_len, bigram, trigram)
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    # Return the vocabulary and the training, validation, and test sets
    return vocab, train, dev, test

# DatasetIterater iterates over a dataset, yielding it batch by batch
class DatasetIterater(object):
    # Takes three arguments: batches is the list of examples, batch_size is the batch size, device is the target device
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        # Number of complete batches the dataset divides into
        self.n_batches = len(batches) // batch_size
        self.residue = False  # whether a final, incomplete batch remains
        if len(batches) % batch_size != 0:
            self.residue = True
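        # For example (illustrative numbers): 1050 examples with batch_size = 100 give
        # n_batches = 10 full batches plus a residue batch of 50, so __len__ returns 11.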
        self.index = 0
        self.device = device

    # _to_tensor converts a list of examples into PyTorch tensors
    def _to_tensor(self, datas):
        # xx = [xxx[2] for xxx in datas]
        # indexx = np.argsort(xx)[::-1]
        # datas = np.array(datas)[indexx]
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        bigram = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        trigram = torch.LongTensor([_[4] for _ in datas]).to(self.device)

        # sequence length before padding (lengths beyond pad_size are clamped to pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        return (x, seq_len, bigram, trigram), y

    def __next__(self):
        # If a leftover batch exists and we have reached it, handle the final, incomplete batch
        if self.residue and self.index == self.n_batches:
            # Compute the start of the last batch from index and take all remaining examples
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            # Convert to tensors with _to_tensor
            batches = self._to_tensor(batches)
            return batches

        # If all batches have been consumed, reset the index and raise StopIteration to end the iteration;
        # otherwise, yield the next complete batch
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    # Return the iterator itself, implementing the iteration protocol
    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):
    iter = DatasetIterater(dataset, config.batch_size, config.device)
    return iter


def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

if __name__ == "__main__":
    '''Extract the pretrained word vectors'''
    vocab_dir = "./THUCNews/data/vocab.pkl"
    pretrain_dir = "./THUCNews/data/sgns.sogou.char"
    emb_dim = 300
    filename_trimmed_dir = "./THUCNews/data/vocab.embedding.sougou"
    word_to_id = pkl.load(open(vocab_dir, 'rb'))
    embeddings = np.random.rand(len(word_to_id), emb_dim)
    f = open(pretrain_dir, "r", encoding='UTF-8')
    for i, line in enumerate(f.readlines()):
        # if i == 0:  # skip the first line if it is a header
        #     continue
        lin = line.strip().split(" ")
        if lin[0] in word_to_id:
            idx = word_to_id[lin[0]]
            emb = [float(x) for x in lin[1:301]]
            embeddings[idx] = np.asarray(emb, dtype='float32')
    f.close()
    np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)
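
The saved file can later be loaded back to initialize an embedding layer. A minimal sketch (assumed usage, mirroring how the repo's model configs consume this file; note that np.savez_compressed appends .npz to the filename):

import numpy as np
import torch
import torch.nn as nn

# Load the trimmed embedding matrix saved above and wrap it in an embedding layer
pretrained = torch.tensor(
    np.load("./THUCNews/data/vocab.embedding.sougou.npz")["embeddings"].astype('float32'))
embedding = nn.Embedding.from_pretrained(pretrained, freeze=False)  # fine-tuned during training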

Training Script

Weight initialization, training, evaluation, and testing all live in train_eval.py.

# coding: UTF-8
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from utils import get_time_dif
from tensorboardX import SummaryWriter


# Weight initialization; defaults to Xavier, which sets the variance of the initial
# weights automatically from the number of neurons on each side of a layer
def init_network(model, method='xavier', exclude='embedding', seed=123):
    # named_parameters returns an iterator over (parameter name, parameter) pairs
    for name, w in model.named_parameters():
        # Initialize a parameter only if its name does not contain the exclude string (default 'embedding')
        if exclude not in name:
            # Names containing 'weight' are treated as weight parameters and initialized with the chosen method
            if 'weight' in name:
                if method == 'xavier':
                    # Xavier normal initialization
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    # Kaiming normal initialization
                    nn.init.kaiming_normal_(w)
                else:
                    # Standard normal initialization
                    nn.init.normal_(w)
            elif 'bias' in name:
                # Initialize biases to the constant 0
                nn.init.constant_(w, 0)
            else:
                # Leave all other parameters untouched
                pass


def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    # Exponential learning-rate decay, applied once per epoch: lr = gamma * lr
    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch count at the last drop in validation loss
    flag = False  # whether training has gone a long time without improvement
    writer = SummaryWriter(log_dir=config.log_path + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        # scheduler.step()  # apply learning-rate decay
        # _to_tensor in the dataset iterator yields batches of the form (x, seq_len, bigram, trigram), y
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # Every 100 batches, report performance on the training and validation sets
                true = labels.data.cpu()
                # outputs.data holds the raw class scores, one row per sample
                # torch.max(outputs.data, 1) reduces over dimension 1, returning (max values, indices); [1] selects the indices
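                # For example (illustrative tensor):
                # torch.max(torch.tensor([[0.1, 2.0, 0.3], [1.5, 0.2, 0.4]]), 1)[1]
                # -> tensor([1, 0]), i.e. the predicted class index of each sample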
                predic = torch.max(outputs.data, 1)[1].cpu()
                # Training accuracy on this batch
                train_acc = metrics.accuracy_score(true, predic)
                # evaluate measures the model on the validation set and returns its accuracy and loss
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                # When the validation loss improves, record it and save the model's state dict
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                # Log training and validation loss/accuracy to TensorBoard
                writer.add_scalar("loss/train", loss.item(), total_batch)
                writer.add_scalar("loss/dev", dev_loss, total_batch)
                writer.add_scalar("acc/train", train_acc, total_batch)
                writer.add_scalar("acc/dev", dev_acc, total_batch)
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # If the validation loss has not dropped for more than config.require_improvement batches (1000 by default), stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    writer.close()
    test(config, model, test_iter)


def test(config, model, test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
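
To tie the two scripts together, here is a minimal sketch of a driver in the style of the repo's run.py. The module path models.FastText and the Config constructor arguments are assumptions modeled on the repo layout, not guaranteed by the code above:

import time
import numpy as np
import torch
from importlib import import_module
from utils_fasttext import build_dataset, build_iterator, get_time_dif
from train_eval import train, init_network

model_name = 'FastText'
x = import_module('models.' + model_name)  # assumption: one module per model under models/
config = x.Config('THUCNews', 'embedding_SougouNews.npz')  # assumption: dataset dir + embedding file
np.random.seed(1)
torch.manual_seed(1)

start_time = time.time()
vocab, train_data, dev_data, test_data = build_dataset(config, False)  # False -> char-level
train_iter = build_iterator(train_data, config)
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)
print("Time usage:", get_time_dif(start_time))

config.n_vocab = len(vocab)
model = x.Model(config).to(config.device)
init_network(model)  # Xavier-initialize everything except the embedding layer
train(config, model, train_iter, dev_iter, test_iter)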
