Imagine the thought process you go through when translating a long sentence:
The problem with traditional Seq2Seq models:
How the attention mechanism solves it:
The three-step computation (see the numeric sketch just below):
Score: measure how relevant the decoder's current state is to each encoder state
Attention Weights: normalize the scores with softmax
Context Vector: take the weighted sum of the encoder states
context = ∑(attention_weights * encoder_states)
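To make the three steps concrete, here is a small standalone NumPy sketch (toy shapes and random values, purely illustrative) that scores a handful of encoder states against one decoder state, normalizes the scores with softmax, and forms the context vector:
import numpy as np

# Toy dimensions: 4 encoder states with hidden size 3 (illustrative values only)
encoder_states = np.random.rand(4, 3)   # (source length, hidden size)
decoder_state = np.random.rand(3)       # current decoder hidden state

# 1. Score: relevance of the decoder state to every encoder state (dot product here)
scores = encoder_states @ decoder_state            # shape (4,)

# 2. Attention weights: softmax-normalize the scores so they sum to 1
weights = np.exp(scores) / np.sum(np.exp(scores))  # shape (4,)

# 3. Context vector: weighted sum of the encoder states
context = weights @ encoder_states                 # shape (3,)

print("attention weights:", weights)
print("context vector:", context)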
Type | Formula | Characteristics |
---|---|---|
Additive attention | score = vᵀ tanh(W₁h + W₂s) | More computation, but flexible |
Dot-product attention | score = hᵀs | Cheap to compute, but h and s must have matching dimensions |
Scaled dot-product attention | score = hᵀs / √dₖ | Used by the Transformer; current best practice |
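A quick NumPy sketch of the three scoring functions from the table (toy dimensions and random weights; the additive parameters W1, W2, v are just illustrative placeholders):
import numpy as np

d_k = 4                      # hidden size (toy value)
h = np.random.rand(d_k)      # one encoder state
s = np.random.rand(d_k)      # the decoder state

# Additive attention: a small feed-forward scorer; h and s could even have different sizes
W1 = np.random.rand(d_k, d_k)
W2 = np.random.rand(d_k, d_k)
v = np.random.rand(d_k)
additive_score = v @ np.tanh(W1 @ h + W2 @ s)

# Dot-product attention: cheapest, but h and s must share the same dimension
dot_score = h @ s

# Scaled dot-product attention: divide by sqrt(d_k) so the scores stay in a reasonable range
scaled_score = (h @ s) / np.sqrt(d_k)

print(additive_score, dot_score, scaled_score)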
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Download the dataset
import os

path_to_zip = tf.keras.utils.get_file(
    'fra-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip',
    extract=True
)
# The archive contains fra.txt; adjust the path if it unpacks into a subdirectory
path_to_file = os.path.join(os.path.dirname(path_to_zip), 'fra.txt')
# Read the data
def load_data(path):
    df = pd.read_csv(path, sep='\t', header=None, names=['en', 'fr'])
    # Add start and end markers to the target sentences
    df['fr'] = '<start> ' + df['fr'] + ' <end>'
    return df.sample(50000)  # use 50,000 sentence pairs
df = load_data(path_to_file)
print(df.head())
# Look at the sentence-length distribution
df['en_len'] = df['en'].apply(lambda x: len(x.split()))
df['fr_len'] = df['fr'].apply(lambda x: len(x.split()))
print("\nAverage English length:", df['en_len'].mean())
print("Average French length:", df['fr_len'].mean())
# Visualize the length distributions
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.hist(df['en_len'], bins=30)
plt.title('English sentence length')
plt.subplot(1, 2, 2)
plt.hist(df['fr_len'], bins=30)
plt.title('French sentence length')
plt.show()
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Configuration
MAX_LEN = 20
VOCAB_SIZE = 10000
# English tokenizer
en_tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters='')
en_tokenizer.fit_on_texts(df['en'])
en_vocab_size = len(en_tokenizer.word_index) + 1
# French tokenizer
fr_tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters='')
fr_tokenizer.fit_on_texts(df['fr'])
fr_vocab_size = len(fr_tokenizer.word_index) + 1
# Convert sentences to padded integer sequences
def preprocess_sentences(sentences, tokenizer, max_len):
    seq = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(seq, maxlen=max_len, padding='post')
    return padded
# Prepare the training data
input_data = preprocess_sentences(df['en'], en_tokenizer, MAX_LEN)
target_data = preprocess_sentences(df['fr'], fr_tokenizer, MAX_LEN)
# Train/validation split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    input_data, target_data, test_size=0.2, random_state=42)
print("\nEnglish vocabulary size:", en_vocab_size)
print("French vocabulary size:", fr_vocab_size)
print("Training samples:", len(X_train))
print("Validation samples:", len(X_val))
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(en_vocab_size, 256)(encoder_inputs)
encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]  # keep the final encoder states
# Decoder (keep a handle on the embedding layer so it can be reused at inference time)
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(fr_vocab_size, 256)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
# Attention mechanism (also return the attention scores so they can be visualized later)
attention, attention_scores = tf.keras.layers.Attention()(
    [decoder_outputs, encoder_outputs], return_attention_scores=True)
decoder_concat = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs, attention])
# Output layer
decoder_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat)
# Define the training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()
# Prepare decoder inputs and targets (teacher forcing)
decoder_input_data = y_train[:, :-1]   # drop the last token
decoder_target_data = y_train[:, 1:]   # drop the first token (the <start> token)
# Validation data
val_decoder_input = y_val[:, :-1]
val_decoder_target = y_val[:, 1:]
# Data generator (saves memory)
def data_generator(encoder_input, decoder_input, decoder_target, batch_size=64):
    num_samples = len(encoder_input)
    while True:
        for offset in range(0, num_samples, batch_size):
            batch_encoder_input = encoder_input[offset:offset+batch_size]
            batch_decoder_input = decoder_input[offset:offset+batch_size]
            batch_decoder_target = decoder_target[offset:offset+batch_size]
            yield [batch_encoder_input, batch_decoder_input], batch_decoder_target
# Create the generators
train_gen = data_generator(X_train, decoder_input_data, decoder_target_data)
val_gen = data_generator(X_val, val_decoder_input, val_decoder_target)
# Training configuration
steps_per_epoch = len(X_train) // 64
validation_steps = len(X_val) // 64
# Checkpoint callback to keep the best model
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'transformer_model.h5',
    save_best_only=True,
    monitor='val_loss',
    mode='min'
)
# Start training
history = model.fit(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    validation_data=val_gen,
    validation_steps=validation_steps,
    callbacks=[checkpoint]
)
# Visualize the training curves
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend()
plt.show()
# Encoder inference model
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])
# Decoder inference model
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# The encoder outputs are passed in as an explicit input at inference time
encoder_outputs_input = Input(shape=(None, 256))
# Reuse the trained decoder embedding layer rather than creating a fresh one
dec_emb_inf = dec_emb_layer(decoder_inputs)
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    dec_emb_inf, initial_state=decoder_states_inputs)
decoder_states_inf = [state_h_inf, state_c_inf]
# Attention layer
attention_inf = tf.keras.layers.Attention()([decoder_outputs_inf, encoder_outputs_input])
decoder_concat_inf = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs_inf, attention_inf])
decoder_outputs_inf = decoder_dense(decoder_concat_inf)
decoder_model = Model(
    [decoder_inputs, encoder_outputs_input] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states_inf)
# Translation function
def translate(input_seq):
    # Encode the input sentence
    enc_out, h, c = encoder_model.predict(input_seq)
    # Initialize the decoder input with the <start> token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_tokenizer.word_index['<start>']
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq, enc_out, h, c])
        # Sample the next word (greedy decoding)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = fr_tokenizer.index_word.get(sampled_token_index, '')
        decoded_sentence.append(sampled_word)
        # Stop when the maximum length is reached or the <end> token is produced
        if sampled_word == '<end>' or len(decoded_sentence) > MAX_LEN:
            stop_condition = True
        # Update the target sequence with the sampled token
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
    return ' '.join(decoded_sentence[:-1])  # strip the trailing <end> token
# Test translations on random validation samples
def test_translation(n=5):
    for i in range(n):
        idx = np.random.randint(0, len(X_val))
        input_seq = X_val[idx:idx+1]
        english = ' '.join([en_tokenizer.index_word.get(i, '') for i in input_seq[0] if i != 0])
        french = translate(input_seq)
        print(f"\nEnglish: {english}")
        print(f"Translation: {french}")
test_translation()
# Build a model that also outputs the attention weights captured when the model was defined
attention_model = Model(
    inputs=model.inputs,
    outputs=[model.outputs[0], attention_scores]
)
# Visualization function
def plot_attention(input_seq, translated_words):
    # Rebuild the decoder input from the translated sentence and get the attention weights
    decoder_seq = fr_tokenizer.texts_to_sequences(['<start> ' + translated_words])
    decoder_seq = pad_sequences(decoder_seq, maxlen=MAX_LEN, padding='post')
    _, attention_weights = attention_model.predict([input_seq, decoder_seq])
    # Prepare the input and output words
    input_text = [en_tokenizer.index_word.get(i, '') for i in input_seq[0] if i != 0]
    output_text = translated_words.split()
    # Draw the heatmap
    plt.figure(figsize=(10, 5))
    plt.imshow(attention_weights[0, :len(output_text), :len(input_text)], cmap='viridis')
    plt.xticks(range(len(input_text)), input_text, rotation=90)
    plt.yticks(range(len(output_text)), output_text)
    plt.xlabel('Input words')
    plt.ylabel('Output words')
    plt.title('Attention weights')
    plt.colorbar()
    plt.show()
# Example visualization
idx = np.random.randint(0, len(X_val))
input_seq = X_val[idx:idx+1]
translation = translate(input_seq)
plot_attention(input_seq, translation)
lesson_27_attention/
├── README.md
├── requirements.txt
├── attention_translation.py      # main program
├── utils/
│   ├── data_loader.py            # data loading utilities
│   └── visualization.py          # visualization utilities
├── models/                       # saved models
│   └── transformer_model.h5
└── output/                       # results
    ├── training_curve.png
    ├── attention_heatmap.png
    └── sample_translations.txt
tensorflow==2.8.0
numpy==1.21.0
matplotlib==3.4.0
pandas==1.3.0
scikit-learn
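The utils/data_loader.py helper imported by the main program below is not listed in the lesson. Here is a minimal sketch, assuming it simply packages the loading, tokenization, and splitting steps walked through above; the function name and return shape follow main(), while exposing the fitted tokenizers at module level is my own assumption:
# utils/data_loader.py -- minimal sketch (assumed implementation)
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

MAX_LEN = 20
VOCAB_SIZE = 10000

# Fitted tokenizers exposed at module level (assumption) so the main program can use them
en_tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters='')
fr_tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters='')


def load_and_preprocess_data(sample_size=50000):
    # Download and read the English-French pairs, as in the walkthrough above
    path_to_zip = tf.keras.utils.get_file(
        'fra-eng.zip',
        origin='http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip',
        extract=True)
    # The archive contains fra.txt; adjust if it unpacks into a subdirectory
    path = os.path.join(os.path.dirname(path_to_zip), 'fra.txt')
    df = pd.read_csv(path, sep='\t', header=None, names=['en', 'fr'])
    df['fr'] = '<start> ' + df['fr'] + ' <end>'
    df = df.sample(sample_size)
    # Fit the tokenizers and build padded integer sequences
    en_tokenizer.fit_on_texts(df['en'])
    fr_tokenizer.fit_on_texts(df['fr'])
    input_data = pad_sequences(en_tokenizer.texts_to_sequences(df['en']),
                               maxlen=MAX_LEN, padding='post')
    target_data = pad_sequences(fr_tokenizer.texts_to_sequences(df['fr']),
                                maxlen=MAX_LEN, padding='post')
    X_train, X_val, y_train, y_val = train_test_split(
        input_data, target_data, test_size=0.2, random_state=42)
    return df, (X_train, X_val, y_train, y_val)
With that helper in place, the main program attention_translation.py ties everything together: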
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils.data_loader import load_and_preprocess_data
from utils.visualization import plot_attention
class AttentionTranslator:
    def __init__(self, max_len=20, vocab_size=10000):
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.en_tokenizer = None
        self.fr_tokenizer = None
        self.model = None
        self.encoder_model = None
        self.decoder_model = None

    def build_model(self, en_vocab_size, fr_vocab_size):
        # Encoder
        encoder_inputs = tf.keras.Input(shape=(None,))
        enc_emb = tf.keras.layers.Embedding(en_vocab_size, 256)(encoder_inputs)
        encoder_lstm = tf.keras.layers.LSTM(256, return_sequences=True, return_state=True)
        encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
        # Decoder
        decoder_inputs = tf.keras.Input(shape=(None,))
        dec_emb = tf.keras.layers.Embedding(fr_vocab_size, 256)(decoder_inputs)
        decoder_lstm = tf.keras.layers.LSTM(256, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
        # Attention mechanism
        attention = tf.keras.layers.Attention()([decoder_outputs, encoder_outputs])
        decoder_concat = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs, attention])
        # Output layer
        decoder_dense = tf.keras.layers.Dense(fr_vocab_size, activation='softmax')
        decoder_outputs = decoder_dense(decoder_concat)
        # Define the model
        model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
        return model
    def train(self, X_train, y_train, X_val, y_val, epochs=10):
        # Prepare teacher-forcing inputs and targets
        decoder_input_data = y_train[:, :-1]
        decoder_target_data = y_train[:, 1:]
        val_decoder_input = y_val[:, :-1]
        val_decoder_target = y_val[:, 1:]
        # Train the model
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
            'models/transformer_model.h5',
            save_best_only=True,
            monitor='val_loss'
        )
        history = self.model.fit(
            [X_train, decoder_input_data],
            decoder_target_data,
            batch_size=64,
            epochs=epochs,
            validation_data=([X_val, val_decoder_input], val_decoder_target),
            callbacks=[checkpoint]
        )
        # Save the training curves
        plt.plot(history.history['loss'], label='Training loss')
        plt.plot(history.history['val_loss'], label='Validation loss')
        plt.legend()
        plt.savefig('output/training_curve.png')
        plt.close()
        return history
    def build_inference_models(self):
        # Encoder inference model: recover the encoder LSTM outputs and states
        # (the layer index may need adjusting if the architecture changes)
        encoder_outputs, state_h, state_c = self.model.layers[4].output
        self.encoder_model = tf.keras.Model(
            self.model.input[0],
            [encoder_outputs, state_h, state_c]
        )
        # Decoder inference model: reuse the trained layers
        decoder_inputs = self.model.input[1]
        decoder_embedding = self.model.layers[5]
        decoder_lstm = self.model.layers[6]
        decoder_dense = self.model.layers[-1]
        # Inference-time inputs
        decoder_state_input_h = tf.keras.Input(shape=(256,))
        decoder_state_input_c = tf.keras.Input(shape=(256,))
        encoder_outputs_input = tf.keras.Input(shape=(None, 256))
        # Inference-time computation
        dec_emb_inf = decoder_embedding(decoder_inputs)
        decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
            dec_emb_inf, initial_state=[decoder_state_input_h, decoder_state_input_c])
        attention_inf = tf.keras.layers.Attention()([decoder_outputs_inf, encoder_outputs_input])
        decoder_concat_inf = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs_inf, attention_inf])
        decoder_outputs_inf = decoder_dense(decoder_concat_inf)
        self.decoder_model = tf.keras.Model(
            [decoder_inputs, encoder_outputs_input, decoder_state_input_h, decoder_state_input_c],
            [decoder_outputs_inf, state_h_inf, state_c_inf]
        )
    def translate(self, input_seq):
        # Encode the input
        enc_out, h, c = self.encoder_model.predict(input_seq)
        # Initialize decoding with the <start> token
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = self.fr_tokenizer.word_index['<start>']
        decoded_sentence = []
        while True:
            output_tokens, h, c = self.decoder_model.predict(
                [target_seq, enc_out, h, c])
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_word = self.fr_tokenizer.index_word.get(sampled_token_index, '')
            if sampled_word == '<end>' or len(decoded_sentence) > self.max_len:
                break
            decoded_sentence.append(sampled_word)
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
        return ' '.join(decoded_sentence)
def main():
    # Load the data
    df, (X_train, X_val, y_train, y_val) = load_and_preprocess_data()
    # Initialize the translator
    translator = AttentionTranslator()
    translator.en_tokenizer = en_tokenizer  # assumed to be defined (e.g. exposed by the data loader)
    translator.fr_tokenizer = fr_tokenizer
    # Build the model
    translator.model = translator.build_model(
        len(en_tokenizer.word_index) + 1,
        len(fr_tokenizer.word_index) + 1
    )
    # Train
    translator.train(X_train, y_train, X_val, y_val, epochs=10)
    # Build the inference models
    translator.build_inference_models()
    # Test a translation
    test_idx = np.random.randint(0, len(X_val))
    input_seq = X_val[test_idx:test_idx+1]
    english = ' '.join([translator.en_tokenizer.index_word.get(i, '')
                        for i in input_seq[0] if i != 0])
    french = translator.translate(input_seq)
    print(f"\nEnglish: {english}")
    print(f"Translation: {french}")
    # Save the example
    with open('output/sample_translations.txt', 'w') as f:
        f.write(f"English: {english}\n")
        f.write(f"Translation: {french}\n")
    # Visualize attention
    plot_attention(translator.model, input_seq, french,
                   translator.en_tokenizer, translator.fr_tokenizer,
                   save_path='output/attention_heatmap.png')


if __name__ == "__main__":
    main()
Epoch 1/10
625/625 [==============================] - 45s 65ms/step - loss: 2.8543 - val_loss: 2.1234
Epoch 2/10
625/625 [==============================] - 40s 64ms/step - loss: 1.9821 - val_loss: 1.7652
...
Epoch 10/10
625/625 [==============================] - 40s 64ms/step - loss: 1.1234 - val_loss: 1.4567
English: she is sleeping
Translation: elle dort
models/transformer_model.h5: the trained model weights
output/training_curve.png: the training loss curve
output/attention_heatmap.png: the attention-weight heatmap
output/sample_translations.txt: sample translations
Improvement methods:
Possible causes:
Implementation steps:
Optimization suggestions:
In this lesson we covered:
The attention mechanism was a major breakthrough in NLP, and these ideas lay a solid foundation for moving on to the more advanced Transformer models.