Based on the provided screenshots, we can see part of the code for a Vision Transformer (ViT) network architecture named VitNet. Below is the complete VitNet class, its helper modules, and an example training pipeline.
import torch
import torch.nn as nn
class Mlp(nn.Module):
""" MLP as used in Vision Transformer, MLP-Mixer and related networks """
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
super().__init__()
assert dim % num_heads == 0, 'dim should be divisible by num_heads'
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
act_layer=nn.GELU, norm_layer=nn.LayerNorm):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class VitNet(nn.Module):
    def __init__(self, image_size, patch_size, out_channel, in_channel=3, D=1024,
                 num_layers=4, MLP_hidden=64, num_head=4, head_channel=64,
                 dropout=0.1):
        # num_head must evenly divide D (the original default of 3 does not divide 1024);
        # head_channel is accepted for signature compatibility but is unused below.
super(VitNet, self).__init__()
self.h, self.w = image_size
self.p1, self.p2 = patch_size
        assert self.h % self.p1 == 0 and self.w % self.p2 == 0, 'image size must be divisible by patch size'
        self.N = (self.h // self.p1) * (self.w // self.p2)  # number of patches; also correct for non-square inputs
self.in_channel = in_channel
self.out_channel = out_channel
self.D = D
self.num_layers = num_layers
self.patch_embedding = nn.Conv2d(in_channels=in_channel, out_channels=D, kernel_size=(patch_size[0], patch_size[1]), stride=(patch_size[0], patch_size[1]))
self.positional_encoding = nn.Parameter(torch.randn(1, self.N + 1, D))
self.cls_token = nn.Parameter(torch.randn(1, 1, D))
self.dropout = nn.Dropout(p=dropout)
self.blocks = nn.ModuleList([
Block(dim=D, num_heads=num_head, mlp_ratio=MLP_hidden/D, qkv_bias=True, drop=0., attn_drop=0.)
for _ in range(num_layers)])
self.head = nn.Sequential(
nn.LayerNorm(D),
nn.Linear(D, out_channel)
)
def forward(self, x):
x = self.patch_embedding(x)
x = x.flatten(2).transpose(1, 2)
cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
x = torch.cat((cls_tokens, x), dim=1)
x += self.positional_encoding[:, :x.size(1)]
x = self.dropout(x)
for blk in self.blocks:
x = blk(x)
x = self.head(x[:, 0])
return x
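As a quick sanity check (a minimal sketch; the batch size and parameters here are arbitrary assumptions), the model should map a batch of images to one logit vector per image:
model = VitNet(image_size=(224, 224), patch_size=(16, 16), out_channel=10, num_head=4)
dummy = torch.randn(2, 3, 224, 224)  # a batch of 2 RGB images
logits = model(dummy)
print(logits.shape)  # expected: torch.Size([2, 10])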
Next, an example training loop:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
from GarbageClsDataset import GarbageClsDataset  # assumed to be your dataset class; implemented below
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameter settings
image_size = (224, 224)
patch_size = (16, 16)
out_channel = 10  # number of output classes
in_channel = 3
D = 1024
num_layers = 4
MLP_hidden = 64
num_head = 4  # must evenly divide D (1024 % 3 != 0, so 3 would fail the attention assert)
head_channel = 64
dropout = 0.1
batch_size = 32
learning_rate = 1e-4
epochs = 10
# Create the model instance
model = VitNet(image_size=image_size, patch_size=patch_size, out_channel=out_channel, in_channel=in_channel, D=D,
num_layers=num_layers, MLP_hidden=MLP_hidden, num_head=num_head, head_channel=head_channel, dropout=dropout)
model.to(device)
# Prepare the dataset (adjust root_dir and transforms to your data; a Resize to image_size is usually needed)
dataset = GarbageClsDataset('data/train', transform=ToTensor())
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
for epoch in range(epochs):
running_loss = 0.0
correct_predictions = 0
total_samples = 0
for inputs, labels in dataloader:
inputs, labels = inputs.to(device), labels.to(device)
        # Forward pass
outputs = model(inputs)
loss = criterion(outputs, labels)
        # Backward pass and parameter update
optimizer.zero_grad()
loss.backward()
optimizer.step()
        # Accumulate metrics
running_loss += loss.item()
_, predicted = torch.max(outputs.data, 1)
total_samples += labels.size(0)
correct_predictions += (predicted == labels).sum().item()
    # Epoch summary
avg_loss = running_loss / len(dataloader)
accuracy = correct_predictions / total_samples
print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')
Note that the code above is a basic skeleton assembled from the given information; you will need to adapt the parameters and the dataset handling to your own setup. In particular, the GarbageClsDataset class must be implemented to load and preprocess the data, which we do next.
The remainder of this post builds the same Vision Transformer (ViT) image classifier as a complete, organized project.
First, we define a dataset class to load and preprocess the images, assuming a simple folder-per-class classification dataset.
# src/utils/datasets.py
import os
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
class GarbageClsDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []
        # Assumes the dataset directory layout: root_dir/class_name/image.jpg.
        # Class names are mapped to integer indices so the labels work with CrossEntropyLoss.
        self.classes = sorted(os.listdir(root_dir))
        self.class_to_idx = {name: idx for idx, name in enumerate(self.classes)}
        for class_name in self.classes:
            class_dir = os.path.join(root_dir, class_name)
            if not os.path.isdir(class_dir):
                continue
            for image_name in os.listdir(class_dir):
                self.image_paths.append(os.path.join(class_dir, image_name))
                self.labels.append(self.class_to_idx[class_name])
def __len__(self):
return len(self.image_paths)
def __getitem__(self, idx):
image_path = self.image_paths[idx]
label = self.labels[idx]
image = Image.open(image_path).convert('RGB')
if self.transform:
image = self.transform(image)
return image, label
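A quick usage sketch (the 'data/train' path matches the layout assumed in train.py below; the import path depends on where you run the script from):
from torchvision.transforms import ToTensor
from datasets import GarbageClsDataset  # or src.utils.datasets, depending on your working directory
dataset = GarbageClsDataset('data/train', transform=ToTensor())
image, label = dataset[0]
print(len(dataset), image.shape, label)  # dataset size, image tensor shape, integer class index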
Next, define the preprocessing transforms and wrap the dataset in a DataLoader.
# src/utils/data_loader.py
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, ToTensor, Resize, Normalize
from .datasets import GarbageClsDataset
def get_data_loaders(root_dir, image_size, batch_size, num_workers=4):
transform = Compose([
Resize(image_size),
ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet statistics
])
dataset = GarbageClsDataset(root_dir, transform=transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
return dataloader
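For example (a sketch; the directory name matches the one used in train.py below):
train_loader = get_data_loaders('data/train', image_size=(224, 224), batch_size=32)
images, labels = next(iter(train_loader))
print(images.shape, labels.shape)  # torch.Size([32, 3, 224, 224]) torch.Size([32])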
Then define the Vision Transformer (ViT) model itself; this is the same VitNet as above, placed in its own module.
# src/models/vit.py
import torch
import torch.nn as nn
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
super().__init__()
assert dim % num_heads == 0, 'dim should be divisible by num_heads'
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
act_layer=nn.GELU, norm_layer=nn.LayerNorm):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
self.drop_path = nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class VitNet(nn.Module):
    def __init__(self, image_size, patch_size, out_channel, in_channel=3, D=1024,
                 num_layers=4, MLP_hidden=64, num_head=4, head_channel=64,
                 dropout=0.1):
        # num_head must evenly divide D; head_channel is accepted but unused below.
super(VitNet, self).__init__()
self.h, self.w = image_size
self.p1, self.p2 = patch_size
        assert self.h % self.p1 == 0 and self.w % self.p2 == 0, 'image size must be divisible by patch size'
self.N = (self.h // self.p1) * (self.w // self.p2)
self.in_channel = in_channel
self.out_channel = out_channel
self.D = D
self.num_layers = num_layers
self.patch_embedding = nn.Conv2d(in_channels=in_channel, out_channels=D, kernel_size=(patch_size[0], patch_size[1]), stride=(patch_size[0], patch_size[1]))
self.positional_encoding = nn.Parameter(torch.randn(1, self.N + 1, D))
self.cls_token = nn.Parameter(torch.randn(1, 1, D))
self.dropout = nn.Dropout(p=dropout)
self.blocks = nn.ModuleList([
Block(dim=D, num_heads=num_head, mlp_ratio=MLP_hidden/D, qkv_bias=True, drop=0., attn_drop=0.)
for _ in range(num_layers)])
self.head = nn.Sequential(
nn.LayerNorm(D),
nn.Linear(D, out_channel)
)
def forward(self, x):
x = self.patch_embedding(x)
x = x.flatten(2).transpose(1, 2)
cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
x = torch.cat((cls_tokens, x), dim=1)
x += self.positional_encoding[:, :x.size(1)]
x = self.dropout(x)
for blk in self.blocks:
x = blk(x)
x = self.head(x[:, 0])
return x
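As a rough size check (a sketch using the defaults above, with num_head=4 as the corrected assumption), you can count the trainable parameters:
model = VitNet(image_size=(224, 224), patch_size=(16, 16), out_channel=10, num_head=4)
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{n_params / 1e6:.1f}M trainable parameters')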
Finally, define the main training and validation routine.
# src/train.py
import torch
import torch.nn as nn
import torch.optim as optim
from models.vit import VitNet
from utils.data_loader import get_data_loaders
def train_model(train_dataloader, val_dataloader, model, criterion, optimizer, num_epochs, device):
best_val_accuracy = 0.0
for epoch in range(num_epochs):
model.train()
running_loss = 0.0
correct_predictions = 0
total_samples = 0
for inputs, labels in train_dataloader:
inputs, labels = inputs.to(device), labels.to(device)
            # Forward pass
outputs = model(inputs)
loss = criterion(outputs, labels)
            # Backward pass and parameter update
optimizer.zero_grad()
loss.backward()
optimizer.step()
            # Accumulate metrics
running_loss += loss.item()
_, predicted = torch.max(outputs.data, 1)
total_samples += labels.size(0)
correct_predictions += (predicted == labels).sum().item()
        # Report training results
avg_train_loss = running_loss / len(train_dataloader)
train_accuracy = correct_predictions / total_samples
print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')
        # Validation
model.eval()
val_correct_predictions = 0
val_total_samples = 0
with torch.no_grad():
for inputs, labels in val_dataloader:
                inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
_, predicted = torch.max(outputs.data, 1)
val_total_samples += labels.size(0)
val_correct_predictions += (predicted == labels).sum().item()
val_accuracy = val_correct_predictions / val_total_samples
print(f'Validation Accuracy: {val_accuracy:.4f}')
        # Save the best model so far
if val_accuracy > best_val_accuracy:
best_val_accuracy = val_accuracy
torch.save(model.state_dict(), 'models/best_vit_model.pth')
if __name__ == "__main__":
    # Hyperparameter settings
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    image_size = (224, 224)
    patch_size = (16, 16)
    out_channel = 10  # number of output classes
    in_channel = 3
    D = 1024
    num_layers = 4
    MLP_hidden = 64
    num_head = 4  # must evenly divide D
    head_channel = 64
    dropout = 0.1
    batch_size = 32
    learning_rate = 1e-4
    num_epochs = 10
    # Create the model instance
model = VitNet(image_size=image_size, patch_size=patch_size, out_channel=out_channel, in_channel=in_channel, D=D,
num_layers=num_layers, MLP_hidden=MLP_hidden, num_head=num_head, head_channel=head_channel, dropout=dropout)
model.to(device)
    # Prepare the data loaders
train_dataloader = get_data_loaders('data/train', image_size, batch_size)
val_dataloader = get_data_loaders('data/val', image_size, batch_size)
    # Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
    # Train the model
train_model(train_dataloader, val_dataloader, model, criterion, optimizer, num_epochs, device)
To run the project end to end:
1. Implement the GarbageClsDataset class (src/utils/datasets.py).
2. Implement get_data_loaders (src/utils/data_loader.py).
3. Implement VitNet (src/models/vit.py).
4. Implement train_model (src/train.py).
5. Start training with python src/train.py.
Through these steps you have a complete Vision Transformer (ViT) image-classification pipeline, covering dataset loading, model definition, and the full training and validation loop. A hedged inference sketch using the saved checkpoint follows.
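For completeness, here is a minimal inference sketch. Assumptions: the checkpoint path models/best_vit_model.pth written by train_model, the same hyperparameters as above, and a hypothetical image path example.jpg.
# src/predict.py -- inference sketch; paths and hyperparameters are assumptions
import torch
from PIL import Image
from torchvision.transforms import Compose, ToTensor, Resize, Normalize
from models.vit import VitNet
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VitNet(image_size=(224, 224), patch_size=(16, 16), out_channel=10, num_head=4)
model.load_state_dict(torch.load('models/best_vit_model.pth', map_location=device))
model.to(device).eval()
transform = Compose([
    Resize((224, 224)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # same stats as training
])
image = transform(Image.open('example.jpg').convert('RGB')).unsqueeze(0).to(device)  # hypothetical image
with torch.no_grad():
    logits = model(image)
print('predicted class index:', logits.argmax(dim=1).item())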