目标检测是计算机视觉领域的重要任务,YOLO系列算法因其出色的速度和精度平衡而广受欢迎。YOLOv8作为最新版本,在精度和速度上都有显著提升。然而,在移动端和嵌入式设备上部署时,模型的计算复杂度和参数量仍然是关键挑战。本文将探讨如何利用华为提出的GhostNetv2改进YOLOv8的主干网络,在保持检测精度的同时显著降低计算成本。
GhostNet是华为在2020年提出的轻量级CNN架构,其核心思想是通过"Ghost模块"以更少的计算生成更多特征图。传统卷积生成n个特征图需要n×k×k×Cin的参数量,而Ghost模块先通过常规卷积生成m个内在特征图,再对每个内在特征图施加廉价线性变换(如深度卷积)得到s−1个"Ghost"特征图,与内在特征图本身拼接后,最终得到n=m×s个输出特征图。
GhostNetv2于2022年提出(发表于NeurIPS 2022),主要改进是引入硬件友好的DFC(Decoupled Fully Connected,解耦全连接)注意力机制,用于捕获长距离空间依赖,从而增强廉价操作生成的特征。
YOLOv8默认使用CSPDarknet53作为主干,其特点包括:
虽然效果良好,但在移动端场景下计算量仍然较大。
我们将YOLOv8的主干网络替换为GhostNetv2,同时保留原有的Neck和Head结构。改进后的架构具有以下特点:
import torch
import torch.nn as nn
import torch.nn.functional as F
class DFCAttention(nn.Module):
    """Hardware-friendly channel attention gate.

    The input is globally average-pooled to 1x1, passed through a two-layer
    1x1-convolution bottleneck (ReLU in between), squashed with a sigmoid and
    multiplied back onto the input, so every channel is rescaled by a factor
    in (0, 1).

    NOTE: despite the name, this is a squeeze-and-excitation style gate; it
    does not contain the horizontal/vertical decoupled FC layers.

    Args:
        in_channels: Number of channels of the input feature map.
        ratio: Reduction factor of the bottleneck between the two 1x1 convs.
    """

    def __init__(self, in_channels, ratio=4):
        super().__init__()
        self.in_channels = in_channels
        # Clamp to one channel: `in_channels // ratio` is 0 for narrow inputs
        # (in_channels < ratio), which would create an unusable empty conv.
        reduced_channels = max(1, in_channels // ratio)
        self.fc1 = nn.Conv2d(in_channels, reduced_channels, 1, bias=False)
        self.fc2 = nn.Conv2d(reduced_channels, in_channels, 1, bias=False)

    def forward(self, x):
        """Return ``x`` scaled channel-wise by the learned attention weights."""
        # Global average pooling: one descriptor per channel.
        x_avg = F.adaptive_avg_pool2d(x, (1, 1))
        # Bottleneck of 1x1 convs acting as fully connected layers.
        x_att = F.relu(self.fc1(x_avg))
        x_att = torch.sigmoid(self.fc2(x_att))
        # The (N, C, 1, 1) gate broadcasts over the spatial dimensions.
        return x * x_att
class GhostModuleV2(nn.Module):
    """Ghost module followed by channel attention.

    A regular convolution produces the "intrinsic" feature maps; a cheap
    depthwise convolution derives ``ratio - 1`` extra "ghost" maps from each
    of them. Both groups are concatenated, trimmed to exactly ``oup``
    channels and passed through ``DFCAttention``.

    Args:
        inp: Number of input channels.
        oup: Number of output channels.
        kernel_size: Kernel size of the primary convolution.
        ratio: Total output maps produced per intrinsic map. With
            ``ratio == 1`` there are no ghost maps and the primary
            convolution has no ReLU.
        dw_size: Kernel size of the depthwise (cheap) convolution.
        stride: Stride of the primary convolution.
    """

    def __init__(self, inp, oup, kernel_size=1, ratio=2, dw_size=3, stride=1):
        super().__init__()
        self.oup = oup
        # Ceil division so that init_channels * ratio >= oup even when oup is
        # not a multiple of ratio; the surplus is trimmed in forward().
        init_channels = (oup + ratio - 1) // ratio
        new_channels = init_channels * (ratio - 1)

        self.primary_conv = nn.Sequential(
            nn.Conv2d(inp, init_channels, kernel_size, stride, kernel_size // 2, bias=False),
            nn.BatchNorm2d(init_channels),
            nn.ReLU(inplace=True) if ratio != 1 else nn.Identity(),
        )

        if new_channels > 0:
            self.cheap_operation = nn.Sequential(
                nn.Conv2d(init_channels, new_channels, dw_size, 1, dw_size // 2,
                          groups=init_channels, bias=False),
                nn.BatchNorm2d(new_channels),
                nn.ReLU(inplace=True),
            )
        else:
            # ratio == 1 leaves no ghost channels to generate; skip the cheap
            # branch instead of building a depthwise conv with zero outputs.
            self.cheap_operation = None

        self.attention = DFCAttention(oup)

    def forward(self, x):
        """Return the attended Ghost features with ``oup`` channels."""
        x1 = self.primary_conv(x)
        if self.cheap_operation is None:
            out = x1
        else:
            x2 = self.cheap_operation(x1)
            out = torch.cat([x1, x2], dim=1)
        # Drop surplus channels introduced by the ceil division above.
        out = out[:, :self.oup, :, :]
        return self.attention(out)
class GhostBottleneckV2(nn.Module):
    """Residual bottleneck assembled from Ghost modules.

    Main path: Ghost expansion -> depthwise conv (carries the stride) ->
    batch norm -> channel attention -> Ghost projection. Its result is added
    to a shortcut branch, which is an empty ``nn.Sequential`` (identity) when
    the block keeps both resolution and channel count, and a depthwise +
    pointwise projection otherwise.

    Args:
        in_channels: Channels of the incoming feature map.
        hidden_dim: Channels of the expanded representation.
        out_channels: Channels of the block output.
        kernel_size: Kernel size of the depthwise convolutions.
        stride: 1 or 2; applied in the depthwise conv of both branches.
    """

    def __init__(self, in_channels, hidden_dim, out_channels, kernel_size, stride):
        super().__init__()
        assert stride in [1, 2]

        same_padding = kernel_size // 2

        main_path = [
            # Pointwise Ghost module: expand to hidden_dim channels.
            GhostModuleV2(in_channels, hidden_dim, kernel_size=1),
            # Depthwise convolution.
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride,
                      same_padding, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            # Channel attention on the expanded features.
            DFCAttention(hidden_dim),
            # Pointwise Ghost module: project down to out_channels.
            GhostModuleV2(hidden_dim, out_channels, kernel_size=1, ratio=1),
        ]
        self.conv = nn.Sequential(*main_path)

        keeps_shape = stride == 1 and in_channels == out_channels
        if keeps_shape:
            skip_path = []
        else:
            skip_path = [
                nn.Conv2d(in_channels, in_channels, kernel_size, stride,
                          same_padding, groups=in_channels, bias=False),
                nn.BatchNorm2d(in_channels),
                nn.Conv2d(in_channels, out_channels, 1, 1, 0, bias=False),
                nn.BatchNorm2d(out_channels),
            ]
        self.shortcut = nn.Sequential(*skip_path)

    def forward(self, x):
        """Return the sum of the main path and the shortcut branch."""
        transformed = self.conv(x)
        residual = self.shortcut(x)
        return transformed + residual
class GhostNetV2Backbone(nn.Module):
    """GhostNetV2-style feature extractor returning multi-scale feature maps.

    Args:
        cfgs: Optional list of per-block settings ``[k, exp, c, se, s]``:
            depthwise kernel size, expansion (hidden) channels, output
            channels, SE ratio and stride. The ``se`` entry is currently
            not used when building blocks. Defaults to the built-in
            16-block table below.
        width_mult: Multiplier applied to ``exp`` and ``c`` of every block
            (the 16-channel stem is not scaled).
    """

    def __init__(self, cfgs=None, width_mult=1.0):
        super().__init__()
        if cfgs is None:
            # Configuration following the GhostNetV2 paper.
            cfgs = [
                # k, exp, c, se, s
                [3, 16, 16, 0, 1],
                [3, 48, 24, 0, 2],
                [3, 72, 24, 0, 1],
                [5, 72, 40, 0.25, 2],
                [5, 120, 40, 0.25, 1],
                [3, 240, 80, 0, 2],
                [3, 200, 80, 0, 1],
                [3, 184, 80, 0, 1],
                [3, 184, 80, 0, 1],
                [3, 480, 112, 0.25, 1],
                [3, 672, 112, 0.25, 1],
                [5, 672, 160, 0.25, 2],
                [5, 960, 160, 0, 1],
                [5, 960, 160, 0.25, 1],
                [5, 960, 160, 0, 1],
                [5, 960, 160, 0.25, 1],
            ]

        # Stem: stride-2 3x3 convolution from RGB to 16 channels.
        output_channel = 16
        self.stem = nn.Sequential(
            nn.Conv2d(3, output_channel, 3, 2, 1, bias=False),
            nn.BatchNorm2d(output_channel),
            nn.ReLU(inplace=True),
        )

        # Bottleneck blocks. Each block consumes the previous block's output
        # width, so the input channel count is carried across iterations.
        input_channel = output_channel
        stages = []
        block = GhostBottleneckV2
        for k, exp, c, _se, s in cfgs:
            output_channel = int(c * width_mult)
            hidden_channel = int(exp * width_mult)
            stages.append(block(input_channel, hidden_channel, output_channel, k, s))
            input_channel = output_channel
        self.blocks = nn.Sequential(*stages)

        # Block indices whose outputs are returned for the detector; negative
        # values count from the end, as in normal Python indexing.
        # NOTE(review): with the default table these indices give overall
        # strides 4, 16, 32 and 32. A YOLO neck usually consumes strides
        # 8/16/32 (blocks 4, 10 and 15 here) - confirm the intended taps.
        self.out_indices = [2, 5, 11, -1]

    def forward(self, x):
        """Return the list of feature maps at ``out_indices``, in block order."""
        x = self.stem(x)
        num_blocks = len(self.blocks)
        # enumerate() only yields non-negative indices, so resolve negative
        # entries such as -1 to their absolute position first.
        selected = {i if i >= 0 else num_blocks + i for i in self.out_indices}
        output = []
        for i, block in enumerate(self.blocks):
            x = block(x)
            if i in selected:
                output.append(x)
        return output
将GhostNetv2主干集成到YOLOv8中:
from ultralytics import YOLO
class YOLOv8GhostNetV2(nn.Module):
    """YOLOv8-style detector skeleton with a GhostNetV2 backbone.

    Only the backbone is built here. The neck and head must be supplied by
    the caller; they are expected to be the original YOLOv8 PANet neck and
    detection head, adapted to the backbone's channel widths.

    Args:
        num_classes: Number of object classes. Stored on the instance for
            use by the caller-provided head.
        width_mult: Width multiplier forwarded to ``GhostNetV2Backbone``.
        neck: Callable/module mapping the backbone's feature list to neck
            features. Optional at construction, required by ``forward``.
        head: Callable/module mapping neck features to detections. Optional
            at construction, required by ``forward``.
    """

    def __init__(self, num_classes=80, width_mult=1.0, neck=None, head=None):
        super().__init__()
        self.num_classes = num_classes
        # Backbone network.
        self.backbone = GhostNetV2Backbone(width_mult=width_mult)
        # Neck and head are injected rather than left as placeholders.
        self.neck = neck
        self.head = head

    def forward(self, x):
        """Run backbone -> neck -> head and return the head's outputs.

        Raises:
            TypeError: If no neck or head was provided.
        """
        if self.neck is None or self.head is None:
            raise TypeError(
                "YOLOv8GhostNetV2 requires `neck` and `head` modules; "
                "pass them to the constructor before calling forward()"
            )
        # Multi-scale features from the backbone.
        features = self.backbone(x)
        # Feature pyramid.
        neck_features = self.neck(features)
        # Detection head.
        outputs = self.head(neck_features)
        return outputs
# Usage example: build the model and run one forward pass on a dummy input.
# NOTE(review): YOLOv8GhostNetV2 as defined in this file does not construct a
# real neck or head, so the forward call below fails until they are supplied.
model = YOLOv8GhostNetV2(width_mult=1.0)
# Random input tensor of shape (1, 3, 640, 640).
input_tensor = torch.randn(1, 3, 640, 640)
outputs = model(input_tensor)
# 训练配置示例
def train(model, train_loader, val_loader, epochs=300, criterion=None, validate_fn=None):
    """Train ``model`` with AdamW and a cosine-annealed learning rate.

    Args:
        model: Module to optimise; called as ``model(images)``.
        train_loader: Iterable yielding ``(images, targets)`` batches.
        val_loader: Data passed to the validation callable.
        epochs: Number of epochs; also the cosine schedule length.
        criterion: Callable ``criterion(outputs, targets)`` returning the
            loss tensor (e.g. the YOLOv8 loss). Required.
        validate_fn: Callable ``validate_fn(model, val_loader)`` run every
            10 epochs, starting at epoch 0. When omitted, a module-level
            ``validate`` function is looked up at validation time.

    Raises:
        TypeError: If ``criterion`` is not provided.
    """
    if criterion is None:
        raise TypeError("train() requires a callable `criterion` (e.g. the YOLOv8 loss)")

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=5e-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    for epoch in range(epochs):
        # Re-enter training mode every epoch in case validation switched it.
        model.train()
        for images, targets in train_loader:
            outputs = model(images)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # One scheduler step per epoch, matching T_max=epochs.
        lr_scheduler.step()

        # Validation.
        if epoch % 10 == 0:
            # Resolve the fallback lazily so a missing global `validate` only
            # matters when no validate_fn was passed.
            run_validation = validate if validate_fn is None else validate_fn
            run_validation(model, val_loader)
模型 | 参数量(M) | FLOPs(G) | mAP@0.5 |
---|---|---|---|
YOLOv8-nano | 3.2 | 8.7 | 37.3 |
YOLOv8-s | 11.4 | 28.6 | 44.9 |
YOLOv8-GhostNetv2(ours) | 5.8 | 12.3 | 42.1 |
# TensorRT转换示例
import tensorrt as trt
def build_engine(onnx_path, shape=(1, 3, 640, 640), engine_path="yolov8_ghostnetv2.engine"):
    """Convert an ONNX model into a serialized TensorRT engine file.

    Args:
        onnx_path: Path of the ONNX model to parse.
        shape: Input shape. Currently unused; kept for interface
            compatibility.
        engine_path: Destination file for the serialized engine.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed or the engine
            build fails.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)

    with open(onnx_path, "rb") as onnx_file:
        parsed = parser.parse(onnx_file.read())
    if not parsed:
        # parse() reports failure through its return value; surface the
        # parser's error list instead of building from a broken network.
        errors = "; ".join(str(parser.get_error(i)) for i in range(parser.num_errors))
        raise RuntimeError(f"Failed to parse ONNX model {onnx_path!r}: {errors}")

    config = builder.create_builder_config()
    # Cap the builder workspace at 1 GiB.
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)

    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        raise RuntimeError(f"TensorRT failed to build an engine from {onnx_path!r}")

    with open(engine_path, "wb") as f:
        f.write(serialized_engine)
本文详细介绍了如何使用GhostNetv2改进YOLOv8的主干网络,在显著降低计算复杂度的同时保持较好的检测精度。GhostNetv2的硬件友好特性使其特别适合移动端和边缘计算场景。
未来改进方向包括: