Project 4 - Sentence Classification


Project Description

  • This assignment introduces a simple natural language processing task: sentence classification (text classification)
  • Given a sentence, determine whether it is malicious (negative is labeled 1, positive is labeled 0)

Dataset Overview

There are three files: training_label.txt, training_nolabel.txt, and testing_data.txt

  • training_label.txt: labeled training data (each sentence paired with 0 or 1; +++$+++ is just a separator, ignore it). A small parsing sketch follows this list.

  • e.g., 1 +++$+++ are wtf … awww thanks !

  • training_nolabel.txt: unlabeled training data (sentences only), used for semi-supervised learning

  • ex: hates being this burnt !! ouch

  • testing_data.txt: the test sentences for which you need to predict 0 or 1

    id,text

    0,my dog ate our dinner . no , seriously … he ate it .

    1,omg last day sooon n of primary noooooo x im gona be swimming out of school wif the amount of tears am gona cry

    2,stupid boys … they ’ re so … stupid !
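
As a quick illustration of the labeled format, one line of training_label.txt can be split into a label and a token list as below. This is only a minimal sketch with a made-up line; the load_training_data helper defined later in this notebook does the same thing.

line = "1 +++$+++ are wtf ... awww thanks !"
parts = line.strip('\n').split(' ')
label, words = parts[0], parts[2:]  # parts[1] is the '+++$+++' separator
print(label, words)                 # '1' ['are', 'wtf', '...', 'awww', 'thanks', '!']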

Project Requirements

  • Pretrain word embeddings with some method (e.g., skip-gram, CBOW)
  • Implement the text classifier with an RNN
  • No additional data is allowed (other corpora and pretrained models are forbidden)

Data Preparation

Environment Setup / Installation

!pip install gensim==3.3.0
path_prefix = "./"
# used to filter out warnings
import warnings
warnings.filterwarnings('ignore')

Utils

# utils.py
# This cell defines some helper functions that will be used frequently later
import paddle
import numpy as np
import pandas as pd
paddle.disable_static()

def load_training_data(path='training_label.txt'):
    # Read in the data needed for training
    # For 'training_label.txt' we need to read the labels; for 'training_nolabel.txt' we do not
    if 'training_label' in path:
        with open(path, 'r') as f:
            lines = f.readlines()
            lines = [line.strip('\n').split(' ') for line in lines]
        x = [line[2:] for line in lines]
        y = [line[0] for line in lines]
        return x, y
    else:
        with open(path, 'r') as f:
            lines = f.readlines()
            x = [line.strip('\n').split(' ') for line in lines]
        return x

def load_testing_data(path='testing_data'):
    # Read in the data needed for testing
    with open(path, 'r') as f:
        lines = f.readlines()
        X = ["".join(line.strip('\n').split(",")[1:]).strip() for line in lines[1:]]
        X = [sen.split(' ') for sen in X]
    print("X", X)
    return X

def evaluation(outputs, labels):
    # outputs => predicted probabilities (float)
    # labels => ground-truth labels (0 or 1)

    outputs = paddle.to_tensor([1.0 if element>=0.5 else 0.0 for element in outputs])
    labels = labels.squeeze(1)
    correct = paddle.sum(paddle.cast(paddle.equal(outputs, labels), dtype="int64")).numpy()
    return correct
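
A minimal sanity check of evaluation(), assuming the cell above has been run; the probabilities and labels below are made-up values, not data from the dataset.

probs = paddle.to_tensor([0.9, 0.2, 0.7])      # hypothetical model outputs (probabilities)
labels = paddle.to_tensor([[1.], [0.], [0.]])  # hypothetical ground-truth labels, shape (batch, 1)
print(evaluation(probs, labels))               # -> [2], i.e. two correct predictions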

Training word2vec

# w2v.py
# This cell trains the word2vec embeddings
# Note: word2vec training runs on the CPU here and may take 10+ minutes
import os
import numpy as np
import pandas as pd
import argparse
from gensim.models import word2vec
from gensim.models import Word2Vec

def train_word2vec(x):
    # Train the word2vec embeddings; iter=10 means 10 training epochs
    model = word2vec.Word2Vec(x, size=250, window=5, min_count=5, workers=12, iter=10, sg=1)
    return model

if __name__ == "__main__":
    print("loading training data ...")
    train_x, y = load_training_data('work/rnn_data/training_label.txt')
    train_x_no_label = load_training_data('work/rnn_data/training_nolabel.txt')

    print("loading testing data ...")
    test_x = load_testing_data('work/rnn_data/testing_data.txt')

    #model = train_word2vec(train_x + train_x_no_label + test_x)
    model = train_word2vec(train_x + test_x)
    
    print("saving model ...")
    # model.save(os.path.join(path_prefix, 'model/w2v_all.model'))
    model.save(os.path.join(path_prefix, 'w2v_all.model'))
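
As a small optional sketch of what the trained embeddings look like, the saved model can be loaded back and queried. This assumes w2v_all.model was saved by the cell above; the query word 'happy' is only an illustrative choice and must appear in the vocabulary.

w2v = Word2Vec.load(os.path.join(path_prefix, 'w2v_all.model'))
print(w2v.vector_size)                       # 250, matching size=250 above
print(w2v.wv.most_similar('happy', topn=5))  # nearest neighbours in the embedding space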

Data Preprocessing

# preprocess.py
# This cell performs the data preprocessing
class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
    def get_w2v_model(self):
        # Load the word2vec model trained earlier
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size
    def add_embedding(self, word):
        # Add a word to the embedding and give it a randomly generated representation vector
        # The word will only be "<PAD>" or "<UNK>"
        vector = np.random.uniform(size=(1, self.embedding_dim))
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = np.concatenate([self.embedding_matrix, vector], 0)
    def make_embedding(self, load=True):
        print("Get embedding ...")
        # Get the trained word2vec embeddings
        if load:
            print("loading word to vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError
        # Build a word2idx dictionary
        # Build an idx2word list
        # Build a word2vector list
        for i, word in enumerate(self.embedding.wv.vocab):
            # print('get words #{}'.format(i+1), end='\r')
            #e.g. self.word2index['he'] = 1 
            #e.g. self.index2word[1] = 'he'
            #e.g. self.vectors[1] = 'he' vector
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        # print('')
#        self.embedding_matrix = paddle.to_tensor(self.embedding_matrix)
        self.embedding_matrix = np.array(self.embedding_matrix)
        # 将 "" 跟 "" 加进 embedding 里面
        self.add_embedding("")
        self.add_embedding("")
        # print("total words: {}".format(len(self.embedding_matrix)))
        self.embedding_matrix = self.embedding_matrix.astype(np.float32)
        return self.embedding_matrix
    def pad_sequence(self, sentence):
        # Make every sentence the same length
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        return sentence
    def sentence_word2idx(self):
        # Convert the words in each sentence to their corresponding indices
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            # print('sentence count #{}'.format(i+1), end='\r')
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            # Make every sentence the same length
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
        return paddle.to_tensor(sentence_list)
    
    def labels_to_tensor(self, y):
        # Convert the labels to a tensor
        y = [float(label) for label in y]
        return paddle.to_tensor(y)
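
A minimal usage sketch of Preprocess on two hypothetical toy sentences (the real usage is in the main cell further below); it assumes w2v_all.model has already been trained and saved under path_prefix.

toy_sentences = [['today', 'is', 'a', 'good', 'day'], ['stupid', 'boys']]
toy_preprocess = Preprocess(toy_sentences, sen_len=20, w2v_path=os.path.join(path_prefix, 'w2v_all.model'))
toy_embedding = toy_preprocess.make_embedding(load=True)  # shape (vocab_size + 2, 250); "<PAD>"/"<UNK>" rows are appended
toy_idx = toy_preprocess.sentence_word2idx()              # shape [2, 20], padded with the "<PAD>" index
print(toy_embedding.shape, toy_idx.shape)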

Dataset

# data.py
# Implements the '__init__', '__getitem__', '__len__' methods required by Dataset
# so that a DataLoader can consume it
import paddle
from paddle.io import Dataset

class TwitterDataset(Dataset):
    """
    Expected data shape like:(data_num, data_len)
    Data can be a list of numpy array or a list of lists
    input data shape : (data_num, seq_len, feature_dim)
    __len__ will return the number of data
    """
    def __init__(self, X, y):
        self.data = X
        self.label = y
    def __getitem__(self, idx):
        if self.label is None: 
            return self.data[idx], paddle.to_tensor([1.])
        return self.data[idx], self.label[idx]
    def __len__(self):
        return len(self.data)
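
A quick sketch of how the dataset behaves, using small hypothetical tensors rather than the real data:

dummy_x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]])
dummy_y = paddle.to_tensor([1., 0.])
ds = TwitterDataset(X=dummy_x, y=dummy_y)
print(len(ds), ds[0])    # 2, followed by the first sequence and its label
test_ds = TwitterDataset(X=dummy_x, y=None)
print(test_ds[0])        # the first sequence with a placeholder label of 1.0 (test mode)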

Model

# model.py
# This cell defines the model to be trained

import paddle.nn as nn

class LSTM_Net(nn.Layer):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super(LSTM_Net, self).__init__()
        # Build the embedding layer and initialize it with the pretrained word2vec matrix
        self.embedding = nn.Embedding(embedding.shape[0], embedding.shape[1], sparse=True)
        self.embedding.weight.set_value(embedding)
        # In Paddle, setting stop_gradient=True freezes the embedding weights during training
        self.embedding.weight.stop_gradient = True if fix_embedding else False

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers)
        self.classifier = nn.Sequential(nn.Dropout(dropout),
                                         nn.Linear(hidden_dim, 1),
                                         nn.Sigmoid() )

    def forward(self, inputs):
        inputs = self.embedding(inputs)
#         print("embedding",inputs)
        x, _ = self.lstm(inputs, None)
        # x has shape (batch, seq_len, hidden_size)
        # Take the LSTM output at the last time step
        x = x[:, -1, :] 
        x = self.classifier(x)
        return x
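
A shape sanity check for the network, under the assumption that a random matrix stands in for the real 250-dimensional embedding matrix; the vocabulary size of 1000 and the batch below are hypothetical.

fake_embedding = np.random.uniform(size=(1000, 250)).astype(np.float32)  # hypothetical vocabulary of 1000 words
net = LSTM_Net(fake_embedding, embedding_dim=250, hidden_dim=150, num_layers=1)
fake_batch = paddle.randint(low=0, high=1000, shape=[4, 20])  # 4 sentences of 20 token indices each
print(net(fake_batch).shape)                                  # expected: [4, 1]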

Train

# train.py
# This cell trains the model

def training(batch_size, n_epoch, lr, model_dir, train, valid, model):
    model.train() # set the model to train mode so the optimizer can update its parameters
    criterion = paddle.nn.loss.BCELoss() # define the loss function; here we use binary cross-entropy
    t_batch = len(train) 
    v_batch = len(valid) 
    optimizer = paddle.optimizer.Adam(learning_rate=lr, parameters=model.parameters()) # hand the model parameters to the optimizer with a suitable learning rate
    total_loss, total_acc, best_acc = 0, 0, 0
    for epoch in range(n_epoch):
        total_loss, total_acc = 0, 0
        # Training loop
        for i, (inputs, labels) in enumerate(train):
            optimizer.clear_grad() # gradients from loss.backward() accumulate, so they must be cleared for every batch
            outputs = model(inputs) # feed the inputs to the model
            loss = criterion(outputs, labels) # compute the training loss
            loss.backward() # compute the gradients of the loss
            optimizer.step() # update the model parameters
            correct = evaluation(outputs, labels) # compute the training accuracy

            total_acc += (correct / batch_size)
            total_loss += loss.numpy()
            
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
            	epoch+1, i+1, t_batch, loss.numpy()[0], correct[0]*100/batch_size), end='\r')
        print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss[0]/t_batch, total_acc[0]/t_batch*100))

        # Validation loop
        model.eval() # set the model to eval mode so its parameters stay fixed
        with paddle.no_grad():
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(valid):
                outputs = model(inputs) # feed the inputs to the model
                loss = criterion(outputs, labels) # compute the validation loss
                correct = evaluation(outputs, labels) # compute the validation accuracy
                total_acc += (correct / batch_size)
                total_loss += loss.numpy()

        print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss[0]/v_batch, total_acc[0]/v_batch*100))
        if total_acc > best_acc:
            # If the validation result beats all previous ones, save the current model for later prediction
            best_acc = total_acc
            paddle.save(model.state_dict(), "lstm_crf.pdparams")
            paddle.save(optimizer.state_dict(), "lstm_crf.pdopt")
            print('saving model with acc {:.3f}'.format(total_acc[0]/v_batch*100))
        print('-----------------------------------------------')
        model.train() # switch back to train mode so the parameters can be updated again

Testing

def testing(batch_size, test_loader, model):
    model.eval()
    ret_output = []
    with paddle.no_grad():
        for i, (inputs,labels) in enumerate(test_loader):
            outputs = model(inputs)
            outputs = [1 if element>=0.5 else 0 for element in outputs]
            ret_output += outputs
    return ret_output

Main

# main.py
import os
import argparse
import numpy as np
from gensim.models import word2vec
from sklearn.model_selection import train_test_split
from paddle.io import BatchSampler, DataLoader


# Set up the paths to the data files
train_with_label = os.path.join(path_prefix, 'work/rnn_data/training_label.txt')
train_no_label = os.path.join(path_prefix, 'work/rnn_data/training_nolabel.txt')
testing_data = os.path.join(path_prefix, 'work/rnn_data/testing_data.txt')

w2v_path = os.path.join(path_prefix, 'w2v_all.model') # path to the word2vec model

# Define the sentence length, whether to fix the embedding, the batch size, the number of epochs, the learning rate, and the model directory
sen_len = 20
fix_embedding = True # keep the embedding fixed during training
batch_size = 128
epoch = 5
lr = 0.001
# model_dir = os.path.join(path_prefix, 'model/') # directory for model checkpoints
model_dir = path_prefix # directory for model checkpoints

print("loading data ...") # 把 'training_label.txt' 跟 'training_nolabel.txt' 读进来
train_x, y = load_training_data(train_with_label)
train_x_no_label = load_training_data(train_no_label)

# Preprocess the inputs and labels
preprocess = Preprocess(train_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
train_x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)

# Instantiate the model
model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_x[:180000], train_x[180000:], y[:180000], y[180000:]

# Wrap the data into datasets for the DataLoader
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)

# Turn the data into batches of tensors
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          places=paddle.CPUPlace())

val_loader = DataLoader(dataset=val_dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        places=paddle.CPUPlace())

# Start training
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model)

Predict and Write to csv file

# Test the model and make predictions
print("loading testing data ...")
test_x = load_testing_data(testing_data)
preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
# print("test_x", test_x[0])
test_dataset = TwitterDataset(X=test_x, y=None)
# print("test_dataset", test_dataset[0])
test_loader = DataLoader(dataset = test_dataset,batch_size = batch_size,shuffle = False,
                                            places = paddle.CPUPlace())

print('\nload model ...')

param_state_dict = paddle.load("lstm_crf.pdparams")
opt_state_dict = paddle.load("lstm_crf.pdopt")
model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
model.set_state_dict(param_state_dict)

optimizer = paddle.optimizer.Adam(learning_rate=lr, parameters=model.parameters()) # hand the model parameters to the optimizer with a suitable learning rate
optimizer.set_state_dict(opt_state_dict)

outputs = testing(batch_size, test_loader, model)

# Write the results to a csv file for uploading to Kaggle
tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))],"label":outputs})
print("save csv ...")
tmp.to_csv(os.path.join(path_prefix, 'predict.csv'), index=False)
print("Finish Predicting")

# The following shows how to submit to Kaggle from the command line
# You need to pip install kaggle and create an API token first; see https://github.com/Kaggle/kaggle-api and https://www.kaggle.com/code1110/how-to-submit-from-google-colab for details
# kaggle competitions submit [competition-name] -f [csv file path] -m [message]
# e.g., kaggle competitions submit ml-2020spring-hw4 -f output/predict.csv -m "......"

Check where the files are

!pwd
!ls
