00 reader.py
"""Utilities for parsing PTB text files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import tensorflow as tf
def _read_words(filename):
    with tf.gfile.GFile(filename, "r") as f:
        # tf.compat.as_str works on both Python 2 and 3; newlines are mapped to
        # the "<eos>" token so sentence boundaries become part of the vocabulary.
        return tf.compat.as_str(f.read()).replace("\n", "<eos>").split()

def _build_vocab(filename):
    data = _read_words(filename)
    counter = collections.Counter(data)
    # Sort by descending frequency (ties broken alphabetically) so the most
    # common words receive the smallest ids.
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    return word_to_id


def _file_to_word_ids(filename, word_to_id):
    data = _read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

def ptb_raw_data(data_path=None):
    """Load PTB raw data from data directory "data_path".

    Reads PTB text files, converts strings to integer ids,
    and performs mini-batching of the inputs.

    The PTB dataset comes from Tomas Mikolov's webpage:
    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
        data_path: string path to the directory where simple-examples.tgz has
            been extracted.

    Returns:
        tuple (train_data, valid_data, test_data, vocabulary)
        where each of the data objects can be passed to PTBIterator.
    """
    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")

    word_to_id = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    return train_data, valid_data, test_data, vocabulary

def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """Iterate on the raw PTB data.

    This chunks up raw_data into batches of examples and returns Tensors that
    are drawn from these batches.

    Args:
        raw_data: one of the raw data outputs from ptb_raw_data.
        batch_size: int, the batch size.
        num_steps: int, the number of unrolls.
        name: the name of this operation (optional).

    Returns:
        A pair of Tensors, each shaped [batch_size, num_steps]. The second
        element of the tuple is the same data time-shifted to the right by one.

    Raises:
        tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
    """
    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

        data_len = tf.size(raw_data)
        batch_len = data_len // batch_size
        data = tf.reshape(raw_data[0 : batch_size * batch_len],
                          [batch_size, batch_len])

        epoch_size = (batch_len - 1) // num_steps
        assertion = tf.assert_positive(
            epoch_size,
            message="epoch_size == 0, decrease batch_size or num_steps")
        with tf.control_dependencies([assertion]):
            epoch_size = tf.identity(epoch_size, name="epoch_size")

        i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
        x = tf.strided_slice(data, [0, i * num_steps],
                             [batch_size, (i + 1) * num_steps])
        x.set_shape([batch_size, num_steps])
        y = tf.strided_slice(data, [0, i * num_steps + 1],
                             [batch_size, (i + 1) * num_steps + 1])
        y.set_shape([batch_size, num_steps])
        return x, y
01 Introduction to the PTB dataset
import tensorflow as tf
import reader
DATA_PATH = "../../datasets/PTB_data"
train_data, valid_data, test_data, _ = reader.ptb_raw_data(DATA_PATH)
print(len(train_data))
print(train_data[:100])
'''
929589
[9970, 9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983, 9984, 9986, 9987, 9988, 9989, 9991, 9992, 9993, 9994, 9995, 9996, 9997, 9998, 9999, 2, 9256, 1, 3, 72, 393, 33, 2133, 0, 146, 19, 6, 9207, 276, 407, 3, 2, 23, 1, 13, 141, 4, 1, 5465, 0, 3081, 1596, 96, 2, 7682, 1, 3, 72, 393, 8, 337, 141, 4, 2477, 657, 2170, 955, 24, 521, 6, 9207, 276, 4, 39, 303, 438, 3684, 2, 6, 942, 4, 3150, 496, 263, 5, 138, 6092, 4241, 6036, 30, 988, 6, 241, 760, 4, 1015, 2786, 211, 6, 96, 4]
'''
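train_data is simply a list of integer ids. To inspect the corresponding words, the vocabulary built by reader._build_vocab (defined above) can be inverted. The snippet below is a small sketch under the assumption that ptb.train.txt sits directly under DATA_PATH, as ptb_raw_data expects; id_to_word is an illustrative name, not something defined in reader.py.

import os
# Invert the word -> id mapping and decode the first ten training ids.
word_to_id = reader._build_vocab(os.path.join(DATA_PATH, "ptb.train.txt"))
id_to_word = {v: k for k, v in word_to_id.items()}
print([id_to_word[i] for i in train_data[:10]])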
result = reader.ptb_producer(train_data, 4, 5)
with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in range(3):
        x, y = sess.run(result)
        print("X%d: " % i, x)
        print("Y%d: " % i, y)
    coord.request_stop()
    coord.join(threads)
'''
X0: [[9970 9971 9972 9974 9975]
[ 332 7147 328 1452 8595]
[1969 0 98 89 2254]
[ 3 3 2 14 24]]
Y0: [[9971 9972 9974 9975 9976]
[7147 328 1452 8595 59]
[ 0 98 89 2254 0]
[ 3 2 14 24 198]]
X1: [[9976 9980 9981 9982 9983]
[ 59 1569 105 2231 1]
[ 0 312 1641 4 1063]
[ 198 150 2262 10 0]]
Y1: [[9980 9981 9982 9983 9984]
[1569 105 2231 1 895]
[ 312 1641 4 1063 8]
[ 150 2262 10 0 507]]
X2: [[9984 9986 9987 9988 9989]
[ 895 1 5574 4 618]
[ 8 713 0 264 820]
[ 507 74 2619 0 1]]
Y2: [[9986 9987 9988 9989 9991]
[ 1 5574 4 618 2]
[ 713 0 264 820 2]
[ 74 2619 0 1 8]]
'''
02 Implementing a language model with a recurrent neural network
# "TensorFlow实战Google深度学习框架", Chapter 08: Recurrent Neural Networks
# Win10, TensorFlow 1.0.1, Python 3.5.3
# CUDA v8.0, cudnn-8.0-windows10-x64-v5.1
# filename: ts08.03.py  # Implementing a language model with a recurrent neural network
import numpy as np
import tensorflow as tf
import reader
# 1. Define the model hyperparameters.
DATA_PATH = "../../datasets/PTB_data"   # directory holding the extracted PTB data files
HIDDEN_SIZE = 200                       # number of hidden units in each LSTM layer
NUM_LAYERS = 2                          # number of stacked LSTM layers
VOCAB_SIZE = 10000                      # vocabulary size of the PTB dataset
LEARNING_RATE = 1.0                     # learning rate for gradient descent
TRAIN_BATCH_SIZE = 20                   # batch size during training
TRAIN_NUM_STEP = 35                     # truncated backprop length during training
EVAL_BATCH_SIZE = 1                     # batch size during evaluation
EVAL_NUM_STEP = 1                       # truncated backprop length during evaluation
NUM_EPOCH = 2                           # number of training epochs
KEEP_PROB = 0.5                         # dropout keep probability
MAX_GRAD_NORM = 5                       # global norm used for gradient clipping
# 2. Define a class describing the model structure.
class PTBModel(object):
    def __init__(self, is_training, batch_size, num_steps):
        self.batch_size = batch_size
        self.num_steps = num_steps

        # Define the input layer.
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        # Define the LSTM structure, with dropout applied during training.
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(HIDDEN_SIZE)
        if is_training:
            lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=KEEP_PROB)
        cell = tf.contrib.rnn.MultiRNNCell([lstm_cell] * NUM_LAYERS)

        # Initialize the state.
        self.initial_state = cell.zero_state(batch_size, tf.float32)
        embedding = tf.get_variable("embedding", [VOCAB_SIZE, HIDDEN_SIZE])

        # Convert the word ids to word embeddings.
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        if is_training:
            inputs = tf.nn.dropout(inputs, KEEP_PROB)

        # Collect the LSTM output at every time step.
        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                cell_output, state = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)
        output = tf.reshape(tf.concat(outputs, 1), [-1, HIDDEN_SIZE])
        weight = tf.get_variable("weight", [HIDDEN_SIZE, VOCAB_SIZE])
        bias = tf.get_variable("bias", [VOCAB_SIZE])
        logits = tf.matmul(output, weight) + bias

        # Define the cross-entropy loss and the average loss per batch.
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])
        self.cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state

        # Define the backpropagation op only when training the model.
        if not is_training: return
        trainable_variables = tf.trainable_variables()

        # Clip the gradients, then define the optimizer and the training step.
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainable_variables), MAX_GRAD_NORM)
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
        self.train_op = optimizer.apply_gradients(zip(grads, trainable_variables))
# 3. Run train_op on data with the given model and return the perplexity over the whole dataset.
def run_epoch(session, model, data, train_op, output_log, epoch_size):
    total_costs = 0.0
    iters = 0
    state = session.run(model.initial_state)

    # Train (or evaluate) for one epoch.
    for step in range(epoch_size):
        x, y = session.run(data)
        cost, state, _ = session.run([model.cost, model.final_state, train_op],
                                     {model.input_data: x, model.targets: y, model.initial_state: state})
        total_costs += cost
        iters += model.num_steps

        if output_log and step % 100 == 0:
            print("After %d steps, perplexity is %.3f" % (step, np.exp(total_costs / iters)))
    return np.exp(total_costs / iters)
# 4. Define the main function and run it.
def main():
    train_data, valid_data, test_data, _ = reader.ptb_raw_data(DATA_PATH)

    # Compute the number of steps in one epoch for each dataset.
    train_data_len = len(train_data)
    train_batch_len = train_data_len // TRAIN_BATCH_SIZE
    train_epoch_size = (train_batch_len - 1) // TRAIN_NUM_STEP

    valid_data_len = len(valid_data)
    valid_batch_len = valid_data_len // EVAL_BATCH_SIZE
    valid_epoch_size = (valid_batch_len - 1) // EVAL_NUM_STEP

    test_data_len = len(test_data)
    test_batch_len = test_data_len // EVAL_BATCH_SIZE
    test_epoch_size = (test_batch_len - 1) // EVAL_NUM_STEP

    initializer = tf.random_uniform_initializer(-0.05, 0.05)
    with tf.variable_scope("language_model", reuse=None, initializer=initializer):
        train_model = PTBModel(True, TRAIN_BATCH_SIZE, TRAIN_NUM_STEP)

    with tf.variable_scope("language_model", reuse=True, initializer=initializer):
        eval_model = PTBModel(False, EVAL_BATCH_SIZE, EVAL_NUM_STEP)

    # Train the model.
    with tf.Session() as session:
        tf.global_variables_initializer().run()

        train_queue = reader.ptb_producer(train_data, train_model.batch_size, train_model.num_steps)
        eval_queue = reader.ptb_producer(valid_data, eval_model.batch_size, eval_model.num_steps)
        test_queue = reader.ptb_producer(test_data, eval_model.batch_size, eval_model.num_steps)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=session, coord=coord)

        for i in range(NUM_EPOCH):
            print("In iteration: %d" % (i + 1))
            run_epoch(session, train_model, train_queue, train_model.train_op, True, train_epoch_size)

            valid_perplexity = run_epoch(session, eval_model, eval_queue, tf.no_op(), False, valid_epoch_size)
            print("Epoch: %d Validation Perplexity: %.3f" % (i + 1, valid_perplexity))

        test_perplexity = run_epoch(session, eval_model, test_queue, tf.no_op(), False, test_epoch_size)
        print("Test Perplexity: %.3f" % test_perplexity)

        coord.request_stop()
        coord.join(threads)

if __name__ == "__main__":
    main()
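
The perplexity reported by run_epoch is the exponential of the average per-word cross-entropy, np.exp(total_costs / iters). A tiny numeric illustration with made-up numbers:

import numpy as np

# Made-up numbers, for illustration only: if the summed per-batch cross-entropy
# over 3 predicted words is 13.8, the reported perplexity is exp(13.8 / 3).
total_costs, iters = 13.8, 3
print(np.exp(total_costs / iters))  # roughly 99.5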
'''
In iteration: 1
After 0 steps, perplexity is 10009.690
After 100 steps, perplexity is 1465.472
After 200 steps, perplexity is 1081.721
After 300 steps, perplexity is 900.029
After 400 steps, perplexity is 785.582
After 500 steps, perplexity is 707.795
After 600 steps, perplexity is 649.047
After 700 steps, perplexity is 598.087
After 800 steps, perplexity is 553.027
After 900 steps, perplexity is 518.264
After 1000 steps, perplexity is 491.357
After 1100 steps, perplexity is 465.917
After 1200 steps, perplexity is 444.835
After 1300 steps, perplexity is 425.917
Epoch: 1 Validation Perplexity: 238.228
In iteration: 2
After 0 steps, perplexity is 350.195
After 100 steps, perplexity is 243.940
After 200 steps, perplexity is 248.997
After 300 steps, perplexity is 249.440
After 400 steps, perplexity is 246.536
After 500 steps, perplexity is 243.698
After 600 steps, perplexity is 243.138
After 700 steps, perplexity is 240.505
After 800 steps, perplexity is 235.897
After 900 steps, perplexity is 233.252
After 1000 steps, perplexity is 231.533
After 1100 steps, perplexity is 228.082
After 1200 steps, perplexity is 225.515
After 1300 steps, perplexity is 222.819
Epoch: 2 Validation Perplexity: 181.821
Test Perplexity: 177.882
'''