YOLOv3(You Only Look Once version 3)是一种高效的实时目标检测算法,由Joseph Redmon和Ali Farhadi于2018年提出。与传统的目标检测方法相比,YOLO将目标检测视为单一的回归问题,直接从完整图像预测边界框及其类别概率,使其成为速度和准确性之间平衡的优秀选择。
本教程适合以下人群:
首先,我们需要创建一个虚拟环境(可选但推荐):
bash
# Create the virtual environment
python -m venv yolov3_env
# Activate the virtual environment
# Windows:
yolov3_env\Scripts\activate
# Linux/Mac:
source yolov3_env/bin/activate
安装必要的库:
bash
# Install the base libraries. Use opencv-python, not opencv-python-headless:
# the headless build ships without GUI support, and the scripts in this
# tutorial call cv2.imshow to display results.
pip install numpy opencv-python matplotlib pillow
# CPU build of PyTorch
pip install torch torchvision
# Or a GPU build of PyTorch (pick the command that matches your CUDA version)
# See https://pytorch.org/get-started/locally/ for the install command for your system
如果你想使用官方的darknet实现,你需要克隆并编译darknet库:
bash
git clone https://github.com/AlexeyAB/darknet.git
cd darknet
# Edit the Makefile and set GPU=1 and CUDNN=1 (if you have a GPU)
# On Windows, use the Makefile.win file
# Build on Linux/Mac:
make
或者,我们可以使用更简单的Python实现,如Ultralytics的YOLOv3:
bash
# Install the Ultralytics package, which provides the YOLO class imported by the Python scripts below
pip install ultralytics
YOLOv3提供了多种预训练模型,最常用的是在COCO数据集上训练的模型。
bash
# Download the pretrained weights
wget https://github.com/ultralytics/yolov3/releases/download/v9.0/yolov3.pt -O yolov3.pt
如果使用原始的darknet实现:
bash
# Download the config file and the weights
wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3.cfg
wget https://pjreddie.com/media/files/yolov3.weights
YOLOv3预训练模型可以检测80种不同的物体,包括人、车辆、动物和日常物品。完整的类别列表如下:
python
# The 80 COCO category names in class-index order (index 0 is "person").
# The grouping comments are only a reading aid; the order must not change.
COCO_CLASSES = [
    # people
    "person",
    # vehicles
    "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    # street objects
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    # animals
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
    # bags and accessories
    "backpack", "umbrella", "handbag", "tie", "suitcase",
    # sports equipment
    "frisbee", "skis", "snowboard", "sports ball", "kite",
    "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    # tableware
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
    # food
    "banana", "apple", "sandwich", "orange", "broccoli",
    "carrot", "hot dog", "pizza", "donut", "cake",
    # furniture
    "chair", "couch", "potted plant", "bed", "dining table", "toilet",
    # electronics
    "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
    # appliances
    "microwave", "oven", "toaster", "sink", "refrigerator",
    # household items
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]
现在我们开始实际操作,使用YOLOv3模型检测图像中的目标。
创建一个Python脚本 detect_image.py:
python
from ultralytics import YOLO
import cv2
import numpy as np
import time
def detect_objects(image_path, conf_threshold=0.25):
    """Run YOLOv3 (Ultralytics) on one image, draw the detections and save the result.

    The annotated image is written to the current directory as
    ``output_<original file name>`` and, when a display is available, shown
    in a window until a key is pressed.

    Args:
        image_path: Path of the image to analyse.
        conf_threshold: Minimum confidence a detection needs to be drawn.

    Returns:
        None. If the image cannot be read, a message is printed and the
        function returns early.
    """
    import os  # local import so the script's top-level import list stays untouched

    # Load the model
    model = YOLO('yolov3.pt')

    # Read the image
    img = cv2.imread(image_path)
    if img is None:
        print(f"无法读取图像: {image_path}")
        return

    # Time the inference call only.
    # The threshold is forwarded to the predictor because it applies its own
    # confidence cut-off; without this, a lower conf_threshold has no effect.
    start_time = time.time()
    results = model(img, conf=conf_threshold)
    inference_time = time.time() - start_time
    print(f"推理时间: {inference_time:.2f}秒")

    # A single input image yields a single result
    result = results[0]

    # Draw every detection that passes the threshold
    for box in result.boxes:
        conf = float(box.conf[0])
        if conf < conf_threshold:
            continue
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        cls_id = int(box.cls[0])
        label = f"{result.names[cls_id]} {conf:.2f}"
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        # Clamp the label baseline so boxes touching the top edge keep a visible label
        cv2.putText(img, label, (x1, max(y1 - 10, 15)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # os.path.basename handles the platform's separators (the original
    # split("/") left the directory part in place for Windows-style paths)
    output_path = "output_" + os.path.basename(image_path)
    if cv2.imwrite(output_path, img):
        print(f"已保存结果到: {output_path}")
    else:
        print(f"无法保存结果到: {output_path}")

    # Showing the result is optional: OpenCV builds without GUI support raise
    # cv2.error here, and the saved file is still a usable result in that case
    try:
        cv2.imshow("Detection Result", img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
    except cv2.error:
        print("当前环境不支持图形界面,已跳过结果显示")
if __name__ == "__main__":
    # Script entry point: replace the file name with the path to your own image
    detect_objects("test_image.jpg")
如果你想使用OpenCV的DNN模块直接加载darknet模型,可以使用以下代码:
python
import cv2
import numpy as np
import time
import argparse
def detect_objects_opencv(image_path, config_path, weights_path, conf_threshold=0.5, nms_threshold=0.4,
                          names_path='coco.names'):
    """Detect objects in one image with a Darknet YOLOv3 model through OpenCV's DNN module.

    The annotated image is written to the current directory as
    ``output_<original file name>`` and, when a display is available, shown
    in a window until a key is pressed.

    Args:
        image_path: Path of the image to analyse.
        config_path: Darknet ``.cfg`` file describing the network.
        weights_path: Darknet ``.weights`` file.
        conf_threshold: Minimum class score for a detection to be kept.
        nms_threshold: Overlap threshold passed to non-maximum suppression.
        names_path: Text file with one class name per line.

    Returns:
        None. If the image cannot be read, a message is printed and the
        function returns early.
    """
    import os  # local import so the script's top-level import list stays untouched

    # Load the class names, ignoring blank lines and stray whitespace
    with open(names_path, 'rt') as f:
        classes = [line.strip() for line in f if line.strip()]

    # One colour per class from a private, fixed-seed generator: same colours
    # as seeding the global generator with 42, without altering global state
    colors = np.random.RandomState(42).randint(0, 255, size=(len(classes), 3), dtype=np.uint8)

    # Load the network
    net = cv2.dnn.readNetFromDarknet(config_path, weights_path)

    # Default to the CPU; switch to CUDA only when a CUDA device is reported.
    # getattr guards against OpenCV builds that do not expose cv2.cuda at all.
    backend = cv2.dnn.DNN_BACKEND_OPENCV
    target = cv2.dnn.DNN_TARGET_CPU
    cuda = getattr(cv2, 'cuda', None)
    if cuda is not None and cuda.getCudaEnabledDeviceCount() > 0:
        backend = cv2.dnn.DNN_BACKEND_CUDA
        target = cv2.dnn.DNN_TARGET_CUDA
    net.setPreferableBackend(backend)
    net.setPreferableTarget(target)

    # Ask OpenCV for the output layer names directly; indexing the result of
    # getUnconnectedOutLayers() by hand depends on the array shape, which
    # differs between OpenCV versions
    output_layers = net.getUnconnectedOutLayersNames()

    # Read the image
    img = cv2.imread(image_path)
    if img is None:
        print(f"无法读取图像: {image_path}")
        return
    height, width = img.shape[:2]

    # Preprocess: scale pixels to [0, 1], resize to 416x416, swap BGR -> RGB
    blob = cv2.dnn.blobFromImage(img, 1/255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)

    # Time the forward pass only
    start_time = time.time()
    outputs = net.forward(output_layers)
    inference_time = time.time() - start_time
    print(f"推理时间: {inference_time:.2f}秒")

    # Collect candidate boxes. Each row is read as
    # [cx, cy, w, h, objectness, class scores...] with coordinates relative to
    # the image size. The best class per row is found with NumPy so the Python
    # loop only visits rows that pass the threshold.
    class_ids = []
    confidences = []
    boxes = []
    for output in outputs:
        scores = output[:, 5:]
        best_ids = np.argmax(scores, axis=1)
        best_scores = scores[np.arange(scores.shape[0]), best_ids]
        for row in np.flatnonzero(best_scores > conf_threshold):
            detection = output[row]
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)
            # Convert the centre point to the top-left corner
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)
            boxes.append([x, y, w, h])
            confidences.append(float(best_scores[row]))
            class_ids.append(int(best_ids[row]))

    # Non-maximum suppression removes overlapping boxes
    indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)

    # NMSBoxes returns an empty tuple, an (N, 1) array or an (N,) array
    # depending on the OpenCV version; flattening covers all three. The
    # original isinstance(i, int) test failed for NumPy integer scalars.
    for i in np.array(indices, dtype=int).reshape(-1):
        i = int(i)
        x, y, w, h = boxes[i]
        label = f"{classes[class_ids[i]]}: {confidences[i]:.2f}"
        color = [int(c) for c in colors[class_ids[i]]]
        # Draw the bounding box and its label
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        # Clamp the label baseline so boxes touching the top edge keep a visible label
        cv2.putText(img, label, (x, max(y - 5, 15)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    # os.path.basename handles the platform's separators (the original
    # split("/") left the directory part in place for Windows-style paths)
    output_path = "output_" + os.path.basename(image_path)
    if cv2.imwrite(output_path, img):
        print(f"已保存结果到: {output_path}")
    else:
        print(f"无法保存结果到: {output_path}")

    # Showing the result is optional: OpenCV builds without GUI support raise
    # cv2.error here, and the saved file is still a usable result in that case
    try:
        cv2.imshow("Detection Result", img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
    except cv2.error:
        print("当前环境不支持图形界面,已跳过结果显示")
if __name__ == "__main__":
    # Command-line options as (flag, type, default, help text)
    option_specs = (
        ('--image', str, 'test_image.jpg', '输入图像路径'),
        ('--config', str, 'yolov3.cfg', '模型配置文件路径'),
        ('--weights', str, 'yolov3.weights', '模型权重文件路径'),
        ('--conf', float, 0.5, '置信度阈值'),
    )
    cli = argparse.ArgumentParser(description='YOLOv3目标检测')
    for flag, value_type, default_value, help_text in option_specs:
        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)
    options = cli.parse_args()

    # A coco.names file must be present before running; it can be downloaded from
    # https://github.com/AlexeyAB/darknet/blob/master/data/coco.names
    detect_objects_opencv(options.image, options.config, options.weights, options.conf)
确保你在同一目录下有一个 coco.names 文件,包含所有COCO类别:
bash
# Download the coco.names file
wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/data/coco.names
现在让我们扩展到视频目标检测,这对于监控、行为分析等应用非常有用。
创建一个脚本 detect_video.py:
python
from ultralytics import YOLO
import cv2
import time
import argparse
def detect_video(video_path, conf_threshold=0.25, output_path=None):
    """Run YOLOv3 (Ultralytics) on a video file or camera stream, frame by frame.

    Each frame is annotated with the detections and a running FPS counter,
    optionally written to ``output_path`` and, when a display is available,
    shown in a window. Press ``q`` in that window to stop.

    Args:
        video_path: Path of a video file, or a string of digits selecting a
            camera index (for example ``"0"``).
        conf_threshold: Minimum confidence a detection needs to be drawn.
        output_path: Where to write the annotated video; nothing is written
            when this is None or empty.

    Returns:
        None. If the source cannot be opened, a message is printed and the
        function returns early.
    """
    # Load the model
    model = YOLO('yolov3.pt')

    # A purely numeric source selects a camera; anything else is a file path
    if video_path.isdigit():
        cap = cv2.VideoCapture(int(video_path))
    else:
        cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"无法打开视频源: {video_path}")
        return

    # Read the stream properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # Some sources (typically cameras) report 0 or NaN; a writer created
        # with that rate produces an unusable file, so fall back to 30
        fps = 30.0

    # Prepare the optional output video
    out = None
    if output_path:
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            print(f"无法创建输出视频: {output_path}")
            out.release()
            out = None

    frame_count = 0
    elapsed_time = 0.0  # defined up front so the summary works when no frame is read
    show_window = True  # switched off after the first failed imshow
    window_opened = False
    start_time = time.time()

    # try/finally releases the capture and writer even if inference raises
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1

            # The threshold is forwarded to the predictor because it applies
            # its own confidence cut-off; without this, a lower
            # conf_threshold has no effect
            result = model(frame, conf=conf_threshold)[0]

            # Draw every detection that passes the threshold
            for box in result.boxes:
                conf = float(box.conf[0])
                if conf < conf_threshold:
                    continue
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cls_id = int(box.cls[0])
                label = f"{result.names[cls_id]} {conf:.2f}"
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, label, (x1, max(y1 - 10, 15)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Running average FPS since the first frame
            elapsed_time = time.time() - start_time
            fps_text = f"FPS: {frame_count / max(elapsed_time, 1e-9):.2f}"
            cv2.putText(frame, fps_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            # Save the annotated frame
            if out is not None:
                out.write(frame)

            # Display is optional: OpenCV builds without GUI support raise
            # cv2.error, in which case processing continues without a window
            if show_window:
                try:
                    cv2.imshow("Video Detection", frame)
                    window_opened = True
                    # Press 'q' to quit
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
                except cv2.error:
                    show_window = False
                    print("当前环境不支持图形界面,已跳过结果显示")
    finally:
        # Release resources
        cap.release()
        if out is not None:
            out.release()
        if window_opened:
            cv2.destroyAllWindows()

    average_fps = frame_count / elapsed_time if elapsed_time > 0 else 0.0
    print(f"处理了 {frame_count} 帧,平均 FPS: {average_fps:.2f}")
if __name__ == "__main__":
    # Command-line options as (flag, type, default, help text)
    option_specs = (
        ('--video', str, '0', '输入视频路径或摄像头索引 (默认为0,表示默认摄像头)'),
        ('--conf', float, 0.25, '置信度阈值'),
        ('--output', str, None, '输出视频路径 (可选)'),
    )
    cli = argparse.ArgumentParser(description='YOLOv3视频目标检测')
    for flag, value_type, default_value, help_text in option_specs:
        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)
    options = cli.parse_args()

    detect_video(options.video, options.conf, options.output)
如果你想使用OpenCV DNN模块处理视频:
python
import cv2
import numpy as np
import time
import argparse
def detect_video_opencv(video_path, config_path, weights_path, conf_threshold=0.5, nms_threshold=0.4,
                        output_path=None, names_path='coco.names'):
    """Run a Darknet YOLOv3 model on a video or camera stream through OpenCV's DNN module.

    Each frame is annotated with the detections and a running FPS counter,
    optionally written to ``output_path`` and, when a display is available,
    shown in a window. Press ``q`` in that window to stop. Processing
    statistics are printed at the end.

    Args:
        video_path: Path of a video file, or a string of digits selecting a
            camera index (for example ``"0"``).
        config_path: Darknet ``.cfg`` file describing the network.
        weights_path: Darknet ``.weights`` file.
        conf_threshold: Minimum class score for a detection to be kept.
        nms_threshold: Overlap threshold passed to non-maximum suppression.
        output_path: Where to write the annotated video; nothing is written
            when this is None or empty.
        names_path: Text file with one class name per line.

    Returns:
        None. If the source cannot be opened, a message is printed and the
        function returns early.
    """
    # Load the class names, ignoring blank lines and stray whitespace
    with open(names_path, 'rt') as f:
        classes = [line.strip() for line in f if line.strip()]

    # One colour per class from a private, fixed-seed generator: same colours
    # as seeding the global generator with 42, without altering global state
    colors = np.random.RandomState(42).randint(0, 255, size=(len(classes), 3), dtype=np.uint8)

    # Load the network
    net = cv2.dnn.readNetFromDarknet(config_path, weights_path)

    # Default to the CPU; switch to CUDA only when a CUDA device is reported.
    # getattr guards against OpenCV builds that do not expose cv2.cuda at all.
    backend = cv2.dnn.DNN_BACKEND_OPENCV
    target = cv2.dnn.DNN_TARGET_CPU
    cuda = getattr(cv2, 'cuda', None)
    if cuda is not None and cuda.getCudaEnabledDeviceCount() > 0:
        backend = cv2.dnn.DNN_BACKEND_CUDA
        target = cv2.dnn.DNN_TARGET_CUDA
    net.setPreferableBackend(backend)
    net.setPreferableTarget(target)

    # Ask OpenCV for the output layer names directly; indexing the result of
    # getUnconnectedOutLayers() by hand depends on the array shape, which
    # differs between OpenCV versions
    output_layers = net.getUnconnectedOutLayersNames()

    # A purely numeric source selects a camera; anything else is a file path
    if video_path.isdigit():
        cap = cv2.VideoCapture(int(video_path))
    else:
        cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"无法打开视频源: {video_path}")
        return

    # Read the stream properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # Some sources (typically cameras) report 0 or NaN; a writer created
        # with that rate produces an unusable file, so fall back to 30
        fps = 30.0

    # Prepare the optional output video
    out = None
    if output_path:
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            print(f"无法创建输出视频: {output_path}")
            out.release()
            out = None

    frame_count = 0
    processing_times = []
    show_window = True  # switched off after the first failed imshow
    window_opened = False
    start_time = time.time()

    # try/finally releases the capture and writer even if inference raises
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            frame_start_time = time.time()

            # Scale boxes by the actual frame size; the capture properties
            # read above can be 0 for some sources
            frame_h, frame_w = frame.shape[:2]

            # Preprocess: scale pixels to [0, 1], resize to 416x416, swap BGR -> RGB
            blob = cv2.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True, crop=False)
            net.setInput(blob)

            # Forward pass
            outputs = net.forward(output_layers)

            # Collect candidate boxes. Each row is read as
            # [cx, cy, w, h, objectness, class scores...] with coordinates
            # relative to the frame size. The best class per row is found
            # with NumPy so the Python loop only visits rows that pass the
            # threshold.
            class_ids = []
            confidences = []
            boxes = []
            for output in outputs:
                scores = output[:, 5:]
                best_ids = np.argmax(scores, axis=1)
                best_scores = scores[np.arange(scores.shape[0]), best_ids]
                for row in np.flatnonzero(best_scores > conf_threshold):
                    detection = output[row]
                    center_x = int(detection[0] * frame_w)
                    center_y = int(detection[1] * frame_h)
                    w = int(detection[2] * frame_w)
                    h = int(detection[3] * frame_h)
                    # Convert the centre point to the top-left corner
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)
                    boxes.append([x, y, w, h])
                    confidences.append(float(best_scores[row]))
                    class_ids.append(int(best_ids[row]))

            # Non-maximum suppression removes overlapping boxes
            indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)

            # NMSBoxes returns an empty tuple, an (N, 1) array or an (N,)
            # array depending on the OpenCV version; flattening covers all
            # three. The original isinstance(i, int) test failed for NumPy
            # integer scalars.
            for i in np.array(indices, dtype=int).reshape(-1):
                i = int(i)
                x, y, w, h = boxes[i]
                label = f"{classes[class_ids[i]]}: {confidences[i]:.2f}"
                color = [int(c) for c in colors[class_ids[i]]]
                # Draw the bounding box and its label
                cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
                cv2.putText(frame, label, (x, max(y - 5, 15)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            # Per-frame processing time (preprocessing, inference and drawing)
            processing_times.append(time.time() - frame_start_time)

            # Running average FPS since the first frame, including capture and display
            elapsed_time = time.time() - start_time
            fps_text = f"FPS: {frame_count / max(elapsed_time, 1e-9):.2f}"
            cv2.putText(frame, fps_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            # Save the annotated frame
            if out is not None:
                out.write(frame)

            # Display is optional: OpenCV builds without GUI support raise
            # cv2.error, in which case processing continues without a window
            if show_window:
                try:
                    cv2.imshow("Video Detection", frame)
                    window_opened = True
                    # Press 'q' to quit
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
                except cv2.error:
                    show_window = False
                    print("当前环境不支持图形界面,已跳过结果显示")
    finally:
        # Release resources
        cap.release()
        if out is not None:
            out.release()
        if window_opened:
            cv2.destroyAllWindows()

    # Print the statistics
    avg_time = sum(processing_times) / len(processing_times) if processing_times else 0
    print(f"处理了 {frame_count} 帧")
    print(f"平均处理时间: {avg_time:.4f} 秒/帧")
    print(f"平均 FPS: {1 / avg_time:.2f}" if avg_time > 0 else "无法计算 FPS")
if __name__ == "__main__":
    # Command-line options as (flag, type, default, help text)
    option_specs = (
        ('--video', str, '0', '输入视频路径或摄像头索引 (默认为0,表示默认摄像头)'),
        ('--config', str, 'yolov3.cfg', '模型配置文件路径'),
        ('--weights', str, 'yolov3.weights', '模型权重文件路径'),
        ('--conf', float, 0.5, '置信度阈值'),
        ('--output', str, None, '输出视频路径 (可选)'),
    )
    cli = argparse.ArgumentParser(description='YOLOv3视频目标检测')
    for flag, value_type, default_value, help_text in option_specs:
        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)
    options = cli.parse_args()

    # The NMS threshold is not exposed on the command line; 0.4 is passed explicitly
    detect_video_opencv(options.video, options.config, options.weights, options.conf, 0.4, options.output)
YOLOv3是一个功能强大的模型,但在资源有限的环境中,可能需要进行一些优化。以下是一些实用的优化技巧:
默认情况下,YOLOv3使用416×416的输入分辨率,但你可以降低它以提高速度(输入尺寸必须是32的倍数,例如320×320;分辨率越低,对小目标的检测效果越差):
python
# For example, lower the input resolution to 320x320
blob = cv2.dnn.blobFromImage(img, 1/255.0, (320, 320), swapRB=True, crop=False)
YOLOv3-tiny是YOLOv3的轻量级版本,速度更快但准确性略低:
bash
# Download the YOLOv3-tiny config and weights
wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3-tiny.cfg
wget https://pjreddie.com/media/files/yolov3-tiny.weights
然后修改代码,使用这些文件。
对于OpenCV DNN实现:
python
# Select the CUDA backend and CUDA target for the DNN network `net`
# NOTE(review): presumably this needs an OpenCV build with CUDA support — confirm
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
对于Ultralytics实现,它会自动利用可用的GPU。
如果你需要处理多个图像,可以考虑批处理来提高吞吐量:
python
# Batched inference with Ultralytics YOLOv3.
# The inference argument is named `batch`; `batch_size` is not a recognised
# argument and is rejected when the predictor validates its settings.
results = model(batch_of_images, batch=4)
对于更高级的优化,可以考虑模型量化和TensorRT优化(需要NVIDIA GPU):
bash
# Use ONNX and TensorRT (requires additional steps)
pip install onnx onnxruntime-gpu
# Convert the model to ONNX format, then optimize it with TensorRT