Loading pretrained Word2vec word vectors into Keras

The script below builds a vocabulary from umich-sentiment-train.txt, initializes a Keras Embedding layer with the pretrained GoogleNews word2vec vectors, and trains a small 1D-CNN sentiment classifier.

#!/usr/bin/env python
# -*- coding:utf-8 -*- 
# Author: Jia ShiLin
import collections

import nltk
import numpy as np
from gensim.models import KeyedVectors
from keras.layers.convolutional import Conv1D
from keras.layers.core import Dense, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalAveragePooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

INPUT_FILE = 'umich-sentiment-train.txt'
WORD2VEC_MODEL = 'GoogleNews-vectors-negative300.bin.gz'
VOCAB_SIZE = 5000
EMBED_SIZE = 300
NUM_FILTERS = 256
NUM_WORDS = 3
BATCH_SIZE = 64
NUM_EPOCHS = 10
SEED = 42  # fixed random seed so the train/test split is reproducible

# Step 1: scan the dataset and build a frequency dictionary of its words
# Step 2: parse the dataset into word-id sequences, padded to a common length
# Step 3: convert the labels to one-hot (categorical) format
# Step 4: split the data into training and test sets
counter = collections.Counter()
maxlen = 0
fin = open(INPUT_FILE, 'r')
for line in fin:
    _, sent = line.strip().split('\t')
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    maxlen = max(maxlen, len(words))
    for word in words:
        counter[word] += 1
fin.close()

# Index 0 is reserved for padding; ids 1..VOCAB_SIZE go to the most frequent words.
word2index = collections.defaultdict(int)
for wid, (word, _) in enumerate(counter.most_common(VOCAB_SIZE)):
    word2index[word] = wid + 1
vocab_sz = len(word2index) + 1
index2word = {v: k for k, v in word2index.items()}

xs, ys = [], []
fin = open(INPUT_FILE, 'r')
for line in fin:
    label, sent = line.strip().split('\t')
    ys.append(int(label))
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    wids = [word2index[word] for word in words]
    xs.append(wids)
fin.close()

X = pad_sequences(xs, maxlen=maxlen)
Y = np_utils.to_categorical(ys)

# Finally, split into training and test sets with a 70:30 ratio.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=SEED)

# Load the pretrained word2vec model, look up the vector for each word in our
# vocabulary, and copy it into the weight matrix embedding_weights. The matrix
# has one row per vocabulary entry, holding that word's embedding vector.
word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_MODEL, binary=True)
embedding_weights = np.zeros((vocab_sz, EMBED_SIZE))
for word, index in word2index.items():
    try:
        embedding_weights[index, :] = word2vec[word]
    except KeyError:
        # Words not in the pretrained vocabulary keep their all-zero row.
        pass
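
# Sanity-check sketch (an assumption of this sketch: an all-zero row means the
# word was not found in GoogleNews; row 0 is the padding row and stays zero).
found = int(np.count_nonzero(embedding_weights.any(axis=1)))
print('pretrained vectors found for {}/{} embedding rows'.format(found, vocab_sz))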

# Model: pretrained embeddings -> spatial dropout -> 1D convolution -> global pooling -> softmax
model = Sequential()
model.add(Embedding(vocab_sz, EMBED_SIZE, input_length=maxlen,
                    weights=[embedding_weights]))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dense(2, activation='softmax'))
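
# Optional: print the layer-by-layer summary to confirm the embedding output is
# (maxlen, EMBED_SIZE) and the Dense head has 2 outputs.
model.summary()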


# Compile and train
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
                    validation_data=(Xtest, Ytest))

# Evaluate on the held-out test set
score = model.evaluate(Xtest, Ytest, verbose=1)
print('test score: {:.3f}, accuracy: {:.3f}'.format(score[0], score[1]))
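
# Usage sketch for a single new sentence (the example sentence is illustrative,
# and mapping column 1 to "positive" assumes the training labels use 1 = positive):
sentence = 'i love this movie'
wids = [word2index[w.lower()] for w in nltk.word_tokenize(sentence)]
padded = pad_sequences([wids], maxlen=maxlen)
probs = model.predict(padded)[0]
print('P(negative) = {:.3f}, P(positive) = {:.3f}'.format(probs[0], probs[1]))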



 
