FBRT-YOLO (Faster and Better for Real-Time Aerial Image Detection) is a framework for real-time object detection in aerial images, proposed by a team from Beijing Institute of Technology and published at AAAI 2025. The paper targets the core difficulties of small-object detection in aerial scenes: small targets are hard to localize because of their low resolution and heavy background clutter, and existing methods struggle to balance real-time speed against accuracy.
Object detection in aerial imagery is a key technology for UAVs and remote-sensing monitoring, but it faces distinctive challenges: targets such as vehicles and pedestrians typically occupy only a handful of pixels (often under 0.1% of the image area) and are easily confused with complex backgrounds such as clouds or dense building clusters. Conventional methods raise accuracy by increasing input resolution, but this significantly increases the computational burden and makes it hard to meet the real-time requirements of embedded devices such as on-board UAV chips. Through a lightweight design, FBRT-YOLO achieves a markedly better accuracy-speed trade-off on three aerial benchmarks: VisDrone, UAVDT, and AI-TOD.
Paper link:
The FCM (Feature Complementary Mapping) module addresses a fundamental problem: the loss of small objects' spatial information in deep layers of the network. Traditional feature pyramids such as FPN fuse shallow and deep features, but the backbone still dilutes the precise location information of small targets as features propagate. FCM fuses information through a four-step split / transform / complementary-mapping / aggregate strategy, sketched below.
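As a purely illustrative aid (not the paper's implementation), here is a minimal PyTorch sketch of what a split / transform / complementary-mapping / aggregate block can look like. The class name `FCMSketch`, the 50/50 channel split, the depthwise 3×3 spatial branch, and the sigmoid-gated cross-branch mapping are all assumptions made for this sketch.

```python
import torch
import torch.nn as nn


class FCMSketch(nn.Module):
    """Illustrative split/transform/complementary-mapping/aggregate block.
    A hypothetical sketch, NOT the official FCM implementation."""

    def __init__(self, dim, split_ratio=0.5):
        super().__init__()
        self.c1 = int(dim * split_ratio)  # channels for the spatial branch (assumption)
        self.c2 = dim - self.c1           # channels for the semantic branch
        # Transform: depthwise 3x3 preserves per-channel spatial detail (assumption)
        self.spatial = nn.Conv2d(self.c1, self.c1, 3, 1, 1, groups=self.c1)
        # Transform: 1x1 conv mixes channels for semantic content (assumption)
        self.semantic = nn.Conv2d(self.c2, self.c2, 1)
        # Complementary mapping: each branch gates the other (assumption)
        self.map_s2c = nn.Conv2d(self.c1, self.c2, 1)
        self.map_c2s = nn.Conv2d(self.c2, self.c1, 1)
        self.aggregate = nn.Conv2d(dim, dim, 1)

    def forward(self, x):
        xs, xc = torch.split(x, [self.c1, self.c2], dim=1)  # split
        xs, xc = self.spatial(xs), self.semantic(xc)        # transform
        # complementary mapping: modulate each branch by the other's projection
        xs = xs * torch.sigmoid(self.map_c2s(xc))
        xc = xc * torch.sigmoid(self.map_s2c(xs))
        return self.aggregate(torch.cat([xs, xc], dim=1))   # aggregate
```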
To address the large variation in object scale in aerial images, MKP (Multi-Kernel Perception) replaces YOLO's final downsampling layer with a cascade of multi-scale convolution kernels that adapts the receptive field; the Pzconv block in the code section below follows this multi-kernel cascade pattern.
Conventional detectors carry structural redundancy when processing high-resolution images. FBRT-YOLO optimizes efficiency through two streamlining strategies.
FBRT-YOLO uses YOLOv8 as its baseline: FCM modules replace the original C2f units in the backbone, and an MKP unit replaces the downsampling operation in the final stage. The overall architecture is organized in three stages; a schematic sketch of the module swap follows.
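To make the module swap concrete, the sketch below shows a toy, YOLOv8-flavored stage layout; it assumes `FCMSketch` (above) and `Pzconv` (from the code section later in this article) are in scope. Stage depths, widths, and the stride handling of the final stage are placeholders, not the paper's configuration.

```python
import torch.nn as nn

def fbrt_backbone_sketch(width=64):
    """Toy stage layout. In FBRT-YOLO, FCM modules stand in for the
    baseline's C2f units, and an MKP-style multi-kernel cascade replaces
    the final downsampling conv (kept stride-1 here for simplicity)."""
    return nn.Sequential(
        nn.Conv2d(3, width, 3, 2, 1),          # stem, stride 2
        FCMSketch(width),                      # FCM in place of C2f
        nn.Conv2d(width, width * 2, 3, 2, 1),  # stride-2 downsample
        FCMSketch(width * 2),                  # FCM in place of C2f
        Pzconv(width * 2),                     # multi-kernel cascade as the last stage
    )
```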
Evaluation on the three aerial benchmarks shows FBRT-YOLO outperforming existing real-time detectors across the board:
Table 2: Model performance comparison on the VisDrone dataset

| Model | AP (%) | Params (M) | GFLOPs | Speed (FPS) |
|---|---|---|---|---|
| YOLOv8-S | 27.8 | 11.1 | 28.6 | 142 |
| FBRT-YOLO-S | 30.1 | 2.9 | 22.8 | 189 |
| RT-DETR-R34 | 28.9 | 19.2 | 98.3 | 156 |
| FBRT-YOLO-M | 30.2 | 18.7 | 76.5 | 173 |
Using YOLOv8-S as the baseline, the ablation study introduces the improved components step by step.
Visualized heatmaps show that FBRT-YOLO responds much more strongly to dense small targets than the baseline model, with noticeably more precise attention regions in scenes such as vehicle clusters and small pedestrians.
Through the novel design of the Feature Complementary Mapping (FCM) module and the Multi-Kernel Perception (MKP) unit, FBRT-YOLO tackles the two core problems of aerial image detection: loss of small-object information and insufficient multi-scale adaptability. Its main contributions fall into three areas.
```python
import torch
import torch.nn as nn


def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Pad to 'same' shape outputs."""
    if d > 1:
        # Effective kernel size under dilation
        k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p


class Conv(nn.Module):
    """Standard convolution block: Conv2d + BatchNorm + activation.
    Args: ch_in, ch_out, kernel, stride, padding, groups, dilation, activation."""

    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()

    def forward(self, x):
        """Apply convolution, batch normalization, and activation."""
        return self.act(self.bn(self.conv(x)))

    def forward_fuse(self, x):
        """Apply convolution and activation only (BatchNorm folded into the conv weights)."""
        return self.act(self.conv(x))


class Pzconv(nn.Module):
    """Multi-kernel cascade: depthwise 3x3 -> 1x1 -> depthwise 5x5 -> 1x1 -> depthwise 7x7,
    closed by a residual connection. Every layer preserves the spatial size."""

    def __init__(self, dim):
        super().__init__()
        self.conv1 = nn.Conv2d(dim, dim, 3, 1, 1, groups=dim)  # depthwise 3x3
        self.conv2 = Conv(dim, dim, k=1, s=1)                  # 1x1 conv (BN + SiLU) mixes channels
        self.conv3 = nn.Conv2d(dim, dim, 5, 1, 2, groups=dim)  # depthwise 5x5
        self.conv4 = Conv(dim, dim, k=1, s=1)                  # 1x1 conv (BN + SiLU) mixes channels
        self.conv5 = nn.Conv2d(dim, dim, 7, 1, 3, groups=dim)  # depthwise 7x7

    def forward(self, x):
        x1 = self.conv1(x)
        x2 = self.conv2(x1)
        x3 = self.conv3(x2)
        x4 = self.conv4(x3)
        x5 = self.conv5(x4)
        return x5 + x  # residual connection


if __name__ == "__main__":
    # Input tensor size (Batch, Channel, Height, Width)
    B, C, H, W = 16, 64, 40, 40
    input_tensor = torch.randn(B, C, H, W)
    # Build a Pzconv instance and move everything to GPU if available
    block = Pzconv(dim=C)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    block = block.to(device)
    input_tensor = input_tensor.to(device)
    # Forward pass; the residual design keeps the output shape equal to the input shape
    output = block(input_tensor)
    print(f"Input:  {input_tensor.shape}")
    print(f"Output: {output.shape}")
```
The Pzconv module combines convolution kernels at three different scales:

- `conv1`, `conv3`, and `conv5` are depthwise convolutions (3×3, 5×5, and 7×7 respectively); the `groups=dim` argument makes each channel convolve independently, keeping parameters and FLOPs low.
- `conv2` and `conv4` are standard 1×1 convolutions that mix information across channels between the depthwise layers.
- A residual connection adds `x5` back to the original input `x`, preserving the input signal and stabilizing training.
- Every convolution layer uses padding chosen so that the spatial size of the feature map stays unchanged throughout the block.
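As a rough quantification of the `groups=dim` saving (a quick check, not a figure from the paper), this snippet compares the parameter counts of a depthwise 7×7 convolution and a standard 7×7 convolution at the 64-channel width used in the demo above:

```python
import torch.nn as nn

dim = 64
depthwise = nn.Conv2d(dim, dim, 7, 1, 3, groups=dim)  # one 7x7 filter per channel
standard = nn.Conv2d(dim, dim, 7, 1, 3)               # dim filters of shape dim x 7 x 7

def count(m):
    return sum(p.numel() for p in m.parameters())

print(f"depthwise 7x7: {count(depthwise):,} params")  # 64*7*7 + 64     = 3,200
print(f"standard  7x7: {count(standard):,} params")   # 64*64*7*7 + 64 = 200,768
```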