```python
import math
import torch
import torch.nn as nn
from labml_helpers.module import Module
from labml_nn.utils import clone_module_list
from typing import Optional, List
from torch.utils.data import DataLoader, TensorDataset
from torch import optim
import torch.nn.functional as F
```
```python
import numpy as np
from math import sqrt
import torch
from torch import nn


class Self_Attention(nn.Module):
    # input : batch_size * seq_len * input_dim
    # q : input_dim -> dim_k
    # k : input_dim -> dim_k
    # v : input_dim -> dim_v
    def __init__(self, input_dim, dim_k, dim_v):
        super(Self_Attention, self).__init__()
        self.q = nn.Linear(input_dim, dim_k)
        self.k = nn.Linear(input_dim, dim_k)
        self.v = nn.Linear(input_dim, dim_v)
        self._norm_fact = 1 / sqrt(dim_k)

    def forward(self, x):
        Q = self.q(x)  # Q: batch_size * seq_len * dim_k
        K = self.k(x)  # K: batch_size * seq_len * dim_k
        V = self.v(x)  # V: batch_size * seq_len * dim_v
        # scale Q @ K^T by 1/sqrt(dim_k) before the softmax: batch_size * seq_len * seq_len
        atten = nn.Softmax(dim=-1)(torch.bmm(Q, K.permute(0, 2, 1)) * self._norm_fact)
        # softmax(Q @ K^T / sqrt(dim_k)) @ V : batch_size * seq_len * dim_v
        output = torch.bmm(atten, V)
        return output


X = torch.randn(4, 3, 2)
print(X)
self_atten = Self_Attention(2, 4, 5)  # input_dim: 2, dim_k: 4, dim_v: 5
res = self_atten(X)
print(res.shape)  # torch.Size([4, 3, 5])
```
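As a quick sanity check of the single-head version, the same result can be reproduced with PyTorch's built-in scaled dot-product attention. This assumes PyTorch >= 2.0 (which provides `torch.nn.functional.scaled_dot_product_attention`) and is only a verification sketch, not part of the original code:

```python
import torch
import torch.nn.functional as F

sa = Self_Attention(2, 4, 5)
X = torch.randn(4, 3, 2)
# F.scaled_dot_product_attention applies the same 1/sqrt(dim_k) scaling before softmax
ref = F.scaled_dot_product_attention(sa.q(X), sa.k(X), sa.v(X))
print(torch.allclose(sa(X), ref, atol=1e-6))  # should print True
```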
```python
from math import sqrt
import torch
import torch.nn as nn


class Self_Attention_Muti_Head(nn.Module):
    # input : batch_size * seq_len * input_dim
    # q : input_dim -> dim_k
    # k : input_dim -> dim_k
    # v : input_dim -> dim_v
    def __init__(self, input_dim, dim_k, dim_v, nums_head):
        super(Self_Attention_Muti_Head, self).__init__()
        assert dim_k % nums_head == 0
        assert dim_v % nums_head == 0
        self.q = nn.Linear(input_dim, dim_k)
        self.k = nn.Linear(input_dim, dim_k)
        self.v = nn.Linear(input_dim, dim_v)
        self.nums_head = nums_head
        self.dim_k = dim_k
        self.dim_v = dim_v
        # scale by the per-head dimension
        self._norm_fact = 1 / sqrt(dim_k // nums_head)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape
        dk_head = self.dim_k // self.nums_head
        dv_head = self.dim_v // self.nums_head
        # split into heads: batch_size * nums_head * seq_len * dim_per_head
        Q = self.q(x).reshape(batch_size, seq_len, self.nums_head, dk_head).transpose(1, 2)
        K = self.k(x).reshape(batch_size, seq_len, self.nums_head, dk_head).transpose(1, 2)
        V = self.v(x).reshape(batch_size, seq_len, self.nums_head, dv_head).transpose(1, 2)
        # scaled Q @ K^T per head: batch_size * nums_head * seq_len * seq_len
        atten = nn.Softmax(dim=-1)(torch.matmul(Q, K.transpose(-2, -1)) * self._norm_fact)
        # attention @ V, then merge heads back: batch_size * seq_len * dim_v
        output = torch.matmul(atten, V).transpose(1, 2).reshape(batch_size, seq_len, -1)
        return output


x = torch.rand(1, 3, 4)
print(x)
atten = Self_Attention_Muti_Head(4, 4, 4, 2)
y = atten(x)
print(y.shape)  # torch.Size([1, 3, 4])
```
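For completeness, a quick shape check with a different (made-up) configuration, illustrating the constraint that `dim_k` and `dim_v` must both be divisible by `nums_head`:

```python
import torch

x = torch.rand(2, 5, 8)                       # batch_size=2, seq_len=5, input_dim=8
mha = Self_Attention_Muti_Head(8, 16, 16, 4)  # 4 heads, 4 dims per head
print(mha(x).shape)                           # torch.Size([2, 5, 16])
```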
```python
class SELayer(nn.Module):
    def __init__(self, channel, reduction=16):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)   # the Squeeze step
        y = self.fc(y).view(b, c, 1, 1)   # the Excitation step
        return x * y.expand_as(x)
```
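A minimal usage sketch (the surrounding conv block and layer sizes here are made up for illustration): `SELayer` is typically placed after a convolution so that the learned per-channel weights recalibrate that convolution's feature maps.

```python
import torch
import torch.nn as nn

block = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1),
    nn.BatchNorm2d(64),
    nn.ReLU(inplace=True),
    SELayer(64, reduction=16),
)
feat = block(torch.randn(2, 3, 32, 32))
print(feat.shape)  # torch.Size([2, 64, 32, 32])
```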
```python
class GCT(nn.Module):
    def __init__(self, num_channels, epsilon=1e-5, mode='l2', after_relu=False):
        super(GCT, self).__init__()
        self.alpha = nn.Parameter(torch.ones(1, num_channels, 1, 1))
        self.gamma = nn.Parameter(torch.zeros(1, num_channels, 1, 1))
        self.beta = nn.Parameter(torch.zeros(1, num_channels, 1, 1))
        self.epsilon = epsilon
        self.mode = mode
        self.after_relu = after_relu

    def forward(self, x):
        if self.mode == 'l2':
            # global context embedding: per-channel L2 norm over H, W, scaled by alpha
            embedding = (x.pow(2).sum((2, 3), keepdim=True) + self.epsilon).pow(0.5) * self.alpha
            # channel normalization across the channel dimension
            norm = self.gamma / (embedding.pow(2).mean(dim=1, keepdim=True) + self.epsilon).pow(0.5)
        elif self.mode == 'l1':
            # L1 variant: per-channel L1 norm over H, W (abs is redundant after ReLU)
            _x = x if self.after_relu else torch.abs(x)
            embedding = _x.sum((2, 3), keepdim=True) * self.alpha
            norm = self.gamma / (torch.abs(embedding).mean(dim=1, keepdim=True) + self.epsilon)
        else:
            raise ValueError('Unknown mode: {}'.format(self.mode))
        # gating: 1 + tanh(...) keeps the gate close to identity at initialization
        gate = 1. + torch.tanh(embedding * norm + self.beta)
        return x * gate
```
GCT is best added before a Conv layer. A common schedule is to first freeze the original model and train only the GCT parameters, then unfreeze everything and fine-tune.
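A minimal sketch of that two-stage schedule, assuming a toy model and placeholder optimizer settings (not a recommended recipe): freeze the pre-existing weights, train only the GCT parameters (`alpha`, `gamma`, `beta`), then unfreeze everything for fine-tuning.

```python
import torch
from torch import nn, optim

model = nn.Sequential(
    GCT(3),                                      # GCT placed before the conv layer
    nn.Conv2d(3, 16, kernel_size=3, padding=1),
    nn.ReLU(inplace=True),
)

# Stage 1: freeze everything except the GCT parameters.
for p in model.parameters():
    p.requires_grad = False
for p in model[0].parameters():
    p.requires_grad = True
optimizer = optim.SGD([p for p in model.parameters() if p.requires_grad], lr=1e-2)

# ... train for a few epochs ...

# Stage 2: unfreeze the whole model and fine-tune with a smaller learning rate.
for p in model.parameters():
    p.requires_grad = True
optimizer = optim.SGD(model.parameters(), lr=1e-3)
```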