This file is a Python script (hw04.py) for HW4 of the NTU 2021 Spring Machine Learning course: Speaker Classification. It classifies speakers with a Transformer model, in particular its self-attention mechanism, and provides both training and inference code. Below I walk through the structure of the file, focusing on how to use a Transformer and self-attention, how to train the model, and how to tune its parameters.
First, a brief look at how Transformer and self-attention work, then how the code uses them.
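To make the mechanism concrete, here is a minimal sketch of scaled dot-product self-attention in PyTorch (an illustrative implementation for intuition, not code from hw04.py; it omits the learned Q/K/V projections and multiple heads):

```python
import torch
import torch.nn.functional as F

def self_attention(x):
    # x: (batch, length, d_model); for simplicity Q = K = V = x
    d_model = x.size(-1)
    scores = x @ x.transpose(-2, -1) / d_model ** 0.5  # (batch, length, length)
    weights = F.softmax(scores, dim=-1)                # each row sums to 1
    return weights @ x                                 # weighted sum of the values
```

This is essentially what `nn.TransformerEncoderLayer` computes internally, with learned projections for queries, keys, and values, split across several attention heads.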
The core of hw04.py is the `Classifier` model, which wraps this mechanism:

```python
class Classifier(nn.Module):
    def __init__(self, d_model=80, n_spks=600, dropout=0.1):
        super().__init__()
        # Project the 40-dim mel-spectrogram features up to d_model
        self.prenet = nn.Linear(40, d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, dim_feedforward=256, nhead=2
        )
        # Classification head: d_model -> n_spks speaker logits
        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, n_spks),
        )
```
```python
def forward(self, mels):
    out = self.prenet(mels)        # (batch, length, 40) -> (batch, length, d_model)
    out = out.permute(1, 0, 2)     # (batch, length, d_model) -> (length, batch, d_model)
    out = self.encoder_layer(out)  # Transformer encoding (sequence-first by default)
    out = out.transpose(0, 1)      # (length, batch, d_model) -> (batch, length, d_model)
    stats = out.mean(dim=1)        # mean pooling over time: (batch, d_model)
    out = self.pred_layer(stats)   # (batch, n_spks)
    return out
```
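A quick shape check (the batch size and sequence length here are arbitrary, for illustration only):

```python
model = Classifier()
mels = torch.randn(32, 128, 40)  # (batch, length, n_mels)
logits = model(mels)
print(logits.shape)              # torch.Size([32, 600])
```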
This code shows how to use a Transformer in PyTorch for a classification task. The key building block is `nn.TransformerEncoderLayer`:
```python
self.encoder_layer = nn.TransformerEncoderLayer(
    d_model=d_model,      # input and output feature dimension
    nhead=2,              # number of attention heads; d_model must be divisible by nhead
    dim_feedforward=256,  # hidden size of the feed-forward network
    dropout=0.1           # dropout rate
)
```
To stack several encoder layers, wrap the layer in `nn.TransformerEncoder`:

```python
self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)
```
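If you add this, remember to call the stacked encoder in `forward` instead of the single layer. A one-line sketch of the change:

```python
out = self.encoder(out)  # replaces: out = self.encoder_layer(out)
```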
Training a model means preparing the data, defining the model, setting up the optimizer and learning-rate scheduler, and then running the training loop. Here is the training loop from the code.
```python
for step in range(total_steps):
    batch = next(train_iterator)
    loss, accuracy = model_fn(batch, model, criterion, device)  # forward pass
    loss.backward()        # backpropagate
    optimizer.step()       # update weights
    scheduler.step()       # update the learning rate
    optimizer.zero_grad()  # clear gradients for the next step
```
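For completeness, here is a minimal setup for the pieces the loop assumes (criterion, optimizer, scheduler). The AdamW and linear-warmup choices are illustrative assumptions, not necessarily what hw04.py uses:

```python
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-3)
# Linear warmup over the first warmup_steps, then a constant learning rate
warmup_steps = 1000
scheduler = LambdaLR(optimizer, lambda step: min(1.0, (step + 1) / warmup_steps))
```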
Tuning the Transformer hyperparameters is the Medium-level task of HW4. Below are the adjustable parts of the code and what adjusting each one means.
```python
self.encoder_layer = nn.TransformerEncoderLayer(
    d_model=d_model,      # larger model dimension = more capacity (also adjust the prenet)
    dim_feedforward=256,  # larger feed-forward hidden size = more capacity per layer
    nhead=2               # more attention heads; keep d_model divisible by nhead
)
```
```python
self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)  # more layers = deeper model
```
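For example, one plausible heavier configuration (illustrative values, not tuned results):

```python
self.prenet = nn.Linear(40, 224)
self.encoder_layer = nn.TransformerEncoderLayer(
    d_model=224, dim_feedforward=512, nhead=4, dropout=0.1
)
self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=3)
```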
If you want to train a similar model yourself: prepare the dataset and DataLoader, define the model, set up the loss function, optimizer, and learning-rate scheduler, run the training loop above, and periodically evaluate on a validation set to save the best checkpoint.
Since you have not worked with Conformer before, I will explain the ConformerBlock and ConformerConvModule implementations from scratch, covering their function, principles, and implementation details. Conformer is a Transformer variant designed for speech tasks; it combines global modeling (self-attention) with local modeling (convolution). The code below comes from the modified hw04.py, and we focus on the Conformer parts.
Here is the complete code for ConformerConvModule and ConformerBlock, with comments explaining what each part does.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Convolution module for capturing local features
class ConformerConvModule(nn.Module):
    def __init__(self, d_model=80, kernel_size=31, dropout=0.1):
        super().__init__()
        # Pointwise convolution 1: doubles the channels for the GLU gate
        self.pointwise_conv1 = nn.Conv1d(d_model, d_model * 2, kernel_size=1, stride=1, padding=0, bias=True)
        self.glu = nn.GLU(dim=1)  # Gated Linear Unit: halves the channels back to d_model
        # Depthwise convolution: one filter per channel, captures local context
        self.depthwise_conv = nn.Conv1d(
            d_model,
            d_model,
            kernel_size=kernel_size,
            stride=1,
            padding=(kernel_size - 1) // 2,  # "same" padding for an odd kernel size
            groups=d_model,  # depthwise
            bias=True,
        )
        self.bn = nn.BatchNorm1d(d_model)
        self.swish = nn.SiLU()  # SiLU is the Swish activation; PyTorch has no nn.Swish
        # Pointwise convolution 2
        self.pointwise_conv2 = nn.Conv1d(d_model, d_model, kernel_size=1, stride=1, padding=0, bias=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch, length, d_model) -> (batch, d_model, length) for Conv1d
        x = x.transpose(1, 2)
        # Pointwise conv 1 + GLU
        x = self.pointwise_conv1(x)
        x = self.glu(x)
        # Depthwise conv + BatchNorm + Swish
        x = self.depthwise_conv(x)
        x = self.bn(x)
        x = self.swish(x)
        # Pointwise conv 2
        x = self.pointwise_conv2(x)
        x = self.dropout(x)
        # Back to (batch, length, d_model)
        x = x.transpose(1, 2)
        return x

# Conformer block: FFN, self-attention, and convolution module
class ConformerBlock(nn.Module):
    def __init__(self, d_model=80, nhead=2, dim_feedforward=256, dropout=0.1, kernel_size=31):
        super().__init__()
        # Feed-forward module (half-step, Macaron style)
        self.ffn1 = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, dim_feedforward),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, d_model),
        )
        # Multi-head self-attention
        self.self_attention = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        # Convolution module
        self.conv_module = ConformerConvModule(d_model, kernel_size, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)
        # Feed-forward module (half-step)
        self.ffn2 = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, dim_feedforward),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, d_model),
        )
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x):
        # x: (length, batch, d_model), the layout nn.MultiheadAttention expects by default
        # FFN 1 (half-step residual)
        x = x + 0.5 * self.dropout1(self.ffn1(x))
        # Multi-head self-attention
        attn_output, _ = self.self_attention(x, x, x)
        x = self.norm1(x + self.dropout1(attn_output))
        # The convolution module expects (batch, length, d_model), so permute around it
        conv_out = self.conv_module(x.permute(1, 0, 2)).permute(1, 0, 2)
        x = self.norm2(x + self.dropout2(conv_out))
        # FFN 2 (half-step residual, mirroring FFN 1)
        x = self.norm3(x + 0.5 * self.dropout3(self.ffn2(x)))
        return x
```
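A quick smoke test of the block (illustrative shapes):

```python
block = ConformerBlock(d_model=80, nhead=2)
x = torch.randn(128, 32, 80)  # (length, batch, d_model)
y = block(x)
print(y.shape)                # torch.Size([128, 32, 80])
```

In the modified Classifier, the ConformerBlock would take the place of `self.encoder_layer` in `forward`: both accept (length, batch, d_model) input, so the swap does not require changing the surrounding permutes.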