TextCNN的复现

TextCNN的复现–pytorch的实现

对于TextCNN的讲解,可以参考这篇文章

Convolutional Neural Networks for Sentence Classification - 知乎 (zhihu.com)

接下来主要是对代码内容的详解,完整代码将在文章末尾给出。

使用的数据集为电影评论数据集,其中正面评论约5000条,负面评论也约5000条。

PyTorch的基本训练过程:

加载训练集–构建模型–模型训练–模型评价

首先要加载数据集。加载时需要继承Dataset类,代码如下:

class Data_loader(Dataset):
    """Sentiment dataset built from one file of positive and one of negative reviews.

    Each item is a ``(features, label)`` pair of tensors. Labels are one-hot:
    ``[1, 0]`` for a positive review and ``[0, 1]`` for a negative one.
    Features are either padded vocabulary indices (default) or padded
    word2vec vectors (``word2_vec=True``).
    """

    def __init__(self, file_pos, file_neg, model_path, word2_vec=False):
        """Load, encode and shuffle the whole corpus eagerly.

        :param file_pos: path of the UTF-8 text file with one positive review per line
        :param file_neg: path of the UTF-8 text file with one negative review per line
        :param model_path: path of a saved gensim Word2Vec model; only read
            when ``word2_vec`` is true
        :param word2_vec: if true, encode words as word2vec vectors; otherwise
            encode them as vocabulary indices and expose the vocabulary as
            ``self.dictionary``
        """
        self.file_pos = file_pos
        self.file_neg = file_neg
        if word2_vec:
            self.x_train, self.y_train = self.get_word2vec(model_path)
        else:
            self.x_train, self.y_train, self.dictionary = self.pre_process()

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(features, label)`` pair as tensors."""
        return torch.tensor(self.x_train[idx]), torch.tensor(self.y_train[idx])

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.x_train)

    def clean_sentences(self, string):
        """Normalise one raw review into a lower-case, space-separated token string.

        Characters outside ``A-Za-z0-9(),!?'`` and the backtick become spaces,
        English contractions are split off (``isn't`` -> ``is n't``) and
        punctuation is surrounded by spaces.

        :param string: raw review text
        :return: cleaned, stripped, lower-cased text
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", r" 's", string)
        string = re.sub(r"\'ve", r" 've", string)
        string = re.sub(r"n\'t", r" n't", string)
        string = re.sub(r"\'re", r" 're", string)
        string = re.sub(r"\'d", r" 'd", string)
        string = re.sub(r"\'ll", r" 'll", string)
        string = re.sub(r",", r" , ", string)
        string = re.sub(r"!", r" ! ", string)
        # NOTE: the backslash in the next three replacements is emitted
        # literally, so the resulting tokens are "\(", "\)" and "\?". This is
        # kept on purpose so tokens stay identical to the ones any previously
        # trained vocabulary / word2vec model has seen; raw strings merely
        # avoid Python's invalid-escape-sequence warning.
        string = re.sub(r"\(", r" \( ", string)
        string = re.sub(r"\)", r" \) ", string)
        string = re.sub(r"\?", r" \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()

    @staticmethod
    def _read_lines(path):
        """Return the whitespace-stripped lines of the UTF-8 text file at ``path``."""
        # The context manager closes the handle even if decoding fails.
        with open(path, "r", encoding="utf-8") as handle:
            return [line.strip() for line in handle]

    @staticmethod
    def _shuffle(x, y):
        """Return ``x`` and ``y`` reordered by one fixed-seed random permutation."""
        # Fixed seed: every run produces the same ordering.
        np.random.seed(10)
        order = np.random.permutation(np.arange(len(y)))
        return x[order], y[order]

    def load_data_and_labels(self):
        """Read both review files and build the cleaned texts with their labels.

        :return: ``(x_text, y)`` where ``x_text`` is a list of cleaned review
            strings (positives first, then negatives) and ``y`` is a NumPy
            array holding one one-hot row per review
        """
        positive_examples = self._read_lines(self.file_pos)
        negative_examples = self._read_lines(self.file_neg)
        x_text = [self.clean_sentences(s) for s in positive_examples + negative_examples]
        positive_labels = [[1, 0] for _ in positive_examples]  # positive -> [1, 0]
        negative_labels = [[0, 1] for _ in negative_examples]  # negative -> [0, 1]
        # Building the array from one joined list also works when one of the
        # two files is empty (np.concatenate would reject the shape mismatch).
        y = np.array(positive_labels + negative_labels)
        return x_text, y

    def pre_process(self):
        """Encode every review as a padded sequence of vocabulary indices.

        The vocabulary is ordered by descending token frequency. Every
        sequence is right-padded with 0 to the length of the longest review,
        then data and labels are shuffled together. No train/test split is
        done here.

        :return: ``(x, y, dictionary)`` -- shuffled index matrix, shuffled
            labels, and the vocabulary object
        :raises ValueError: if the corpus contains no tokens at all
        """
        x_data, y_label = self.load_data_and_labels()
        word_split = [x.split() for x in x_data]
        voc = [word for words in word_split for word in words]
        if not voc:
            # Previously this path fell through and returned None, which
            # surfaced as an unpacking TypeError in __init__.
            raise ValueError("no tokens found in the positive/negative review files")

        max_document_length = max(len(words) for words in word_split)
        ordere_dict = OrderedDict(sorted(Counter(_flatten(voc)).items(), key=lambda x: x[1], reverse=True))
        # Map every document to its sequence of vocabulary indices.
        dictionary = vocab(ordere_dict)
        encoded = []
        for words in word_split:
            indices = list(dictionary.lookup_indices(words))
            # NOTE(review): 0 is used as the padding id, but the vocabulary is
            # built without a dedicated padding token, so 0 presumably is also
            # the index of the most frequent word -- confirm before relying on
            # padding being distinguishable from that word.
            indices.extend([0] * (max_document_length - len(indices)))
            encoded.append(indices)
        x_shuffled, y_shuffled = self._shuffle(np.array(encoded), y_label)
        return x_shuffled, y_shuffled, dictionary

    def get_word2vec(self, model_path):
        """Encode every review as a padded sequence of word2vec vectors.

        Words missing from the word2vec vocabulary, and padding positions, are
        represented by all-zero vectors.

        :param model_path: path of a saved gensim Word2Vec model
        :return: ``(x, y)`` -- shuffled float32 array indexed as
            ``[review, position, vector component]`` and the shuffled labels
        """
        model = gensim.models.Word2Vec.load(model_path)
        x_data, y_label = self.load_data_and_labels()
        dim = model.vector_size

        sentence_vectors = []
        for sentence in x_data:
            vectors = []
            for word in sentence.split():
                try:
                    vectors.append(model.wv.get_vector(word))
                except KeyError:
                    # Out-of-vocabulary word: fall back to a zero vector.
                    vectors.append(np.zeros(shape=(dim,), dtype=np.float32))
            sentence_vectors.append(vectors)

        max_document_length = max(len(vectors) for vectors in sentence_vectors)
        for vectors in sentence_vectors:
            missing = max_document_length - len(vectors)
            vectors.extend(np.zeros(shape=(dim,), dtype=np.float32) for _ in range(missing))
        vector_data = np.asarray(sentence_vectors, dtype=np.float32)
        return self._shuffle(vector_data, y_label)

上述代码中,__init__、__getitem__、__len__是继承Dataset时必须实现的方法;clean_sentences对读取的数据进行清洗;load_data_and_labels加载数据,返回清洗后的文本及其标签;pre_process对数据进行编码:原始数据是英文文本,因此需要先分词,再把每个词映射为整数索引,最后返回的是数字序列,一行数据对应一句评论。

例如:

I like this movie

编码后将得到 0 1 2 3,其中0对应I,1对应like,以此类推。

get_word2vec则使用预训练的word2vec模型把每个单词映射为词向量。每个单词对应一个固定维度的向量(例如100维),该维度可以在训练word2vec模型时自行设定。

接下来是word2vec模型的训练。出于简便,直接使用该数据集来训练word2vec模型;下面的函数只负责训练并返回模型,保存模型需要另外调用model.save。

代码如下所示:

def get_model(p_file, n_file, vector_size=256):
    """Train a Word2Vec model on the review corpus and return it.

    The model is only returned, not written to disk; saving it is left to
    the caller.

    :param p_file: path of the positive-review file
    :param n_file: path of the negative-review file
    :param vector_size: dimensionality of the learned word vectors
        (defaults to 256, the value that was previously hard-coded)
    :return: the trained Word2Vec model
    """
    # Labels are not needed for unsupervised word2vec training.
    x_data, _ = load_data_and_labels(p_file, n_file)
    sentences = [x.split() for x in x_data]
    return Word2Vec(sentences, vector_size=vector_size)

在这儿设置的每个词的维度是256维。

接下来就是TextCNN模型的构建

class GlobalMaxPool1d(nn.Module):
    """Max-pool across the full extent of dimension 2 of the input."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x`` max-pooled with a window as wide as its dimension 2.

        The pooled dimension is kept with size 1.
        """
        window = x.shape[2]
        pooled = F.max_pool1d(x, kernel_size=window)
        return pooled


class TextCNN(nn.Module):
    """TextCNN classifier: parallel 1-D convolutions, global max-pooling, linear head.

    One ``Conv1d -> BatchNorm1d -> ReLU`` branch is built per
    ``(num_channels[i], kernel_size[i])`` pair. Each branch is max-pooled over
    the sequence dimension, the pooled features are concatenated, and a
    dropout + linear layer produces ``num_classes`` raw scores (no
    sigmoid/softmax is applied).
    """

    def __init__(self, num_classes, num_embeddings=-1, embedding_dim=512, kernel_size=(3, 4, 5, 6),
                 num_channels=(32, 32, 32, 32), embeddings_pretrained=None):
        """Build the network.

        :param num_classes: number of output scores
        :param num_embeddings: vocabulary size; if ``<= 0`` no embedding layer
            is created and ``forward`` expects already-embedded input
        :param embedding_dim: size of one word vector, i.e. the number of
            input channels of every convolution
        :param kernel_size: convolution widths, one per branch
        :param num_channels: output channels, one per branch
        :param embeddings_pretrained: optional weight tensor used to
            initialise a trainable embedding layer; only used when
            ``num_embeddings > 0``
        :raises ValueError: if ``kernel_size`` and ``num_channels`` differ in length
        """
        super().__init__()
        kernel_size = list(kernel_size)
        num_channels = list(num_channels)
        if len(kernel_size) != len(num_channels):
            # zip() below would silently drop the surplus entries while the
            # linear layer is still sized from sum(num_channels), which only
            # fails later with a shape error inside forward().
            raise ValueError(
                "kernel_size and num_channels must have the same length, got "
                "{} and {}".format(len(kernel_size), len(num_channels)))

        self.num_classes = num_classes
        self.num_embeddings = num_embeddings
        if self.num_embeddings > 0:
            if embeddings_pretrained is not None:
                # Build directly from the given weights; no throwaway
                # randomly-initialised table is allocated first.
                self.embedding = nn.Embedding.from_pretrained(embeddings_pretrained, freeze=False)
            else:
                self.embedding = nn.Embedding(num_embeddings, embedding_dim)

        # One convolutional branch per (channels, kernel width) pair.
        self.cnn_layers = nn.ModuleList()
        for channels, width in zip(num_channels, kernel_size):
            self.cnn_layers.append(nn.Sequential(
                nn.Conv1d(in_channels=embedding_dim, out_channels=channels, kernel_size=width),
                nn.BatchNorm1d(channels),
                nn.ReLU(inplace=True),
            ))

        self.pool = GlobalMaxPool1d()
        self.classify = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(sum(num_channels), self.num_classes)
        )

    def forward(self, x):
        """Return the raw class scores for a batch.

        :param x: token indices when an embedding layer exists, otherwise
            already-embedded input; after the optional embedding the tensor
            must be 3-D with the embedding dimension last
        :return: tensor with one row of ``num_classes`` scores per sample
        """
        if self.num_embeddings > 0:
            x = self.embedding(x)
        # Move the embedding dimension to position 1 so it acts as the
        # channel dimension of the 1-D convolutions.
        features = x.permute(0, 2, 1)
        pooled = [self.pool(layer(features)).squeeze(-1) for layer in self.cnn_layers]
        return self.classify(torch.cat(pooled, dim=1))

在构建模型时需要继承nn.Module,同时要实现__init__和forward方法:可以看作__init__负责定义各个层,forward负责把各个层连接起来。

接下来就是对模型进行训练,代码如下所示:

# ---- Experiment configuration ----
batch_size = 832
num_classes = 2
file_pos = 'E:\\PostGraduate\\Paper_review\\pytorch_TextCnn/data/rt-polarity.pos'
file_neg = 'E:\\PostGraduate\\Paper_review\\pytorch_TextCnn/data/rt-polarity.neg'
word2vec_path = 'E:\\PostGraduate\\Paper_review\\pytorch_TextCnn/word2vec1.model'

# ---- Data ----
# word2_vec=True: the model below has no embedding layer, so every sample must
# already be a sequence of word vectors. With the default (index encoding) the
# model would receive 2-D integer input and fail on its permute(0, 2, 1).
# For the index-encoded variant, drop word2_vec=True here and build the model
# with TextCNN(num_classes, num_embeddings=<vocabulary size>) instead.
train_data = Data_loader(file_pos, file_neg, word2vec_path, word2_vec=True)
train_size = int(len(train_data) * 0.8)
test_size = len(train_data) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(train_data, [train_size, test_size])
# Use the configured batch size instead of a second hard-coded number.
train_iter = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluate the whole held-out split in one batch; the order is irrelevant for
# evaluation, so no shuffling.
test_iter = DataLoader(test_dataset, batch_size=test_size, shuffle=False)

# ---- Model ----
# The convolution input channels must equal the word-vector size actually
# stored in the dataset (last axis of x_train), not the constructor default.
model = TextCNN(num_classes, embedding_dim=train_data.x_train.shape[-1])

# ---- Training setup ----
epoch = 100  # number of training epochs
optmizer = torch.optim.Adam(model.parameters(), lr=0.01)
train_losses = []
train_counter = []
test_losses = []
log_interval = 5
test_counter = [i * len(train_iter.dataset) for i in range(epoch + 1)]
device = 'cpu'


def train_loop(n_epochs, optimizer, model, train_loader, device, test_iter):
    """Train ``model`` for ``n_epochs`` epochs and evaluate it after each one.

    Every ``log_interval`` batches the current loss is printed and recorded in
    the module-level ``train_losses`` / ``train_counter`` lists, and the model
    and optimizer state dicts are written to ``./model.pth`` and
    ``./optimizer.pth``. After each epoch the average per-sample loss on
    ``test_iter`` is appended to the module-level ``test_losses`` list.

    :param n_epochs: number of passes over ``train_loader``
    :param optimizer: optimizer already bound to ``model``'s parameters
    :param model: network returning one row of raw scores per sample
    :param train_loader: iterable of ``(features, one_hot_label)`` training batches
    :param device: device the batches are moved to
    :param test_iter: iterable of ``(features, one_hot_label)`` evaluation batches
    """
    # NOTE(review): only the batches are moved to `device`; the model is
    # expected to live there already (it is not moved here on purpose, since
    # the optimizer was created by the caller).
    loss_func = nn.BCEWithLogitsLoss()  # stateless, so build it once
    for epoch in range(1, n_epochs + 1):
        print("开始第{}轮训练".format(epoch))
        model.train()
        correct = 0
        samples_seen = 0  # samples processed in this epoch before the current batch
        for i, (text_data, label) in enumerate(train_loader):
            text_data = text_data.to(device)
            target = label.to(device).float()
            optimizer.zero_grad()
            output = model(text_data)
            loss = loss_func(output, target)
            loss.backward()
            optimizer.step()
            # Predicted class vs. position of the 1 in the one-hot label.
            correct += (output.argmax(dim=1) == target.argmax(dim=1)).sum().item()
            if i % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, samples_seen, len(train_loader.dataset),
                    100. * i / len(train_loader), loss.item()))
                train_losses.append(loss.item())
                # Real number of samples seen so far, not a hard-coded batch size.
                train_counter.append(samples_seen + (epoch - 1) * len(train_loader.dataset))
                torch.save(model.state_dict(), './model.pth')
                torch.save(optimizer.state_dict(), './optimizer.pth')
            samples_seen += len(text_data)
        print("Accuracy: {}/{} ({:.0f}%)\n".format(correct, len(train_loader.dataset),
                                                   100. * correct / len(train_loader.dataset)))
        print("开始第{}轮评价".format(epoch))
        model.eval()
        test_loss = 0.0
        correct = 0
        with torch.no_grad():
            for data, target in test_iter:
                data = data.to(device)
                target = target.to(device).float()
                output = model(data)
                # The loss is already averaged within the batch; weight it by
                # the batch size so that dividing by the dataset size below
                # yields a true per-sample average.
                test_loss += loss_func(output, target).item() * len(data)
                correct += (output.argmax(dim=1) == target.argmax(dim=1)).sum().item()
        test_loss /= len(test_iter.dataset)
        test_losses.append(test_loss)
        print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_iter.dataset),
            100. * correct / len(test_iter.dataset)))


# Run training plus per-epoch evaluation with the objects configured above.
train_loop(n_epochs=epoch, optimizer=optmizer, model=model, train_loader=train_iter,
           device=device, test_iter=test_iter)

在上述代码中,首先加载数据集,然后划分为80%的训练集和20%的测试集,使用的优化器为Adam。训练过程中每隔log_interval个batch会把模型参数和优化器状态(state_dict)分别保存到model.pth和optimizer.pth。

训练结果如下所示:

完整代码链接

t, len(test_iter.dataset),
100. * correct / len(test_iter.dataset)))

train_loop(epoch, optmizer, model, train_iter, device, test_iter)


在上述代码中,首先加载数据集,然后划分为80%的训练集和20%的测试集,使用的优化器为Adam。训练过程中每隔log_interval个batch会把模型参数和优化器状态(state_dict)分别保存到model.pth和optimizer.pth。

训练结果为75%左右。



完整代码链接

[木南/TextCNN (gitee.com)](https://gitee.com/nanwang-crea/text-cnn)

你可能感兴趣的:(cnn,pytorch,分类)