A detailed end-to-end guide to YOLOv8/11-segment instance segmentation: environment setup, data annotation, training, validation/prediction, and ONNX deployment (C++/Python)

Because the environment setup, training, and inference commands for YOLO's detection/segmentation/pose/OBB/classification models are very similar, they are not repeated here; please refer to the **[YOLOv8/11-detect object detection end-to-end tutorial]** linked below. This article focuses on the parts that differ: data annotation, format conversion, and model deployment.

[YOLOv8/11-detect object detection tutorial] A detailed end-to-end guide to YOLOv8/11-detect object detection: environment setup, data annotation, training, validation/prediction, and ONNX deployment (C++/Python)
[Environment setup] Setting up a deep learning environment and common libraries from scratch on Ubuntu/Debian (GPU driver, CUDA, cuDNN, PyTorch, OpenCV, PCL, CMake …) [continuously maintained]
[Ultralytics GitHub] https://github.com/ultralytics/ultralytics
[Ultralytics documentation] https://docs.ultralytics.com/zh/

Table of Contents

  • 1. Data Preparation (Annotation and Conversion)
    • 1.1 Annotation with the SAM segmentation foundation model
    • 1.2 Auto-annotation with a pre-trained ONNX model
  • 2. Model Deployment
    • C++ version
    • Python version

1. Data Preparation (Annotation and Conversion)

1.1 Annotation with the SAM segmentation foundation model

Segmentation annotation is much more tedious than detection, since you have to trace polygons object by object. X-AnyLabeling is recommended: it can load the SAM (Segment Anything) foundation model, which speeds things up considerably. A proxy/VPN may be needed to download the corresponding ONNX model; if you pick the model from the UI it is downloaded automatically and no extra configuration is required, whereas if you download the ONNX model manually you need to write a model yaml config file and load that file when selecting the model. The main thing to change in this .yaml file is the model path. A rough example of the yaml file is shown below.
sam_vit_b_01ec64 ONNX model: [Baidu Netdisk] [CSDN free download]

type: segment_anything
name: segment_anything_vit_b_quant-r20230520
display_name: Segment Anything (ViT-Base Quant)
# encoder_model_path: https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.2.0/sam_vit_b_01ec64.encoder.quant.onnx
# decoder_model_path: https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.2.0/sam_vit_b_01ec64.decoder.quant.onnx
encoder_model_path: D:\CvHub_YoLo_obb\sam_vit_h_4b8939.encoder.quant.onnx
decoder_model_path: D:\CvHub_YoLo_obb\sam_vit_h_4b8939.decoder.quant.onnx
input_size: 1024
max_width: 1024
max_height: 682

When using it, make sure the machine has enough memory, otherwise inference with the large model will be very slow. When the segmentation is not precise enough, add more foreground points and background points (to filter out unwanted edges); the background points are sometimes very important. Click Finish object when the SAM segmentation is done; if the result is still not ideal, the polygon can be fine-tuned afterwards.

1.2 Auto-annotation with a pre-trained ONNX model

This assumes you already have a model pre-trained on your targets, in ONNX format; loading it allows all images to be annotated in one pass. A typical workflow: annotate a small portion first, train a model on it, then use that model to auto-annotate everything else and only fine-tune the results. It is also the best option when you want to add data to the dataset and retrain.
1. First write the config file yolov8n_seg.yaml

type: yolov8_seg
name: yolov8n-seg-r20230620
display_name: YOLOv8n-Seg-My-Model Ultralytics  # name shown in the UI
#model_path: https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.1.0/yolov8n-seg.onnx
model_path: /home/xxx/yolov11/yolo_base/model/yolo11n-seg.onnx  # the key field is this path
nms_threshold: 0.45
confidence_threshold: 0.25
classes:
  - person
  - bicycle
  - car
  - motorcycle
  - airplane
  - bus
  - train
  - truck
  - boat

2. Load the model (i.e. load this yaml file), then annotate all files in one batch.

After annotation, the labels need to be converted to YOLO format. Run the script json2txt_yolo11_seg.py below, changing the paths and the label names as needed; an example of the resulting label format is shown right after the script.

#json2txt_yolo11_seg.py
import cv2
import os
import json
import glob
import numpy as np

class_names = ["0"]
#class_names = ["0","1","2"]

def convert_json_label_to_yolov_seg_label():
    json_path = "./seg0613"  # directory containing the labelled json files
    json_files = glob.glob(json_path + "/*.json")
    # print(json_files)

    # output folder
    output_folder = "./seg_txt"  # directory where the txt labels are written
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for json_file in json_files:
        # print(json_file)
        with open(json_file, 'r') as f:
            json_info = json.load(f)

        img = cv2.imread(os.path.join(json_path, json_info["imagePath"]))
        height, width, _ = img.shape
        np_w_h = np.array([[width, height]], np.int32)

        txt_file = os.path.join(output_folder, os.path.basename(json_file).replace(".json", ".txt"))

        with open(txt_file, "w") as f:
            for point_json in json_info["shapes"]:
                txt_content = ""
                np_points = np.array(point_json["points"], np.int32)
                label = point_json["label"]
                index = class_names.index(label)
                # print(type(label))
                norm_points = np_points / np_w_h
                norm_points_list = norm_points.tolist()
                txt_content += str(index) + " " + " ".join(
                    [" ".join([str(cell[0]), str(cell[1])]) for cell in norm_points_list]) + "\n"
                f.write(txt_content)


convert_json_label_to_yolov_seg_label()
print("end convert!!!")
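
For reference, each line of the generated txt file follows the YOLO segmentation label format: a class index followed by the normalized polygon vertices (x1 y1 x2 y2 … xn yn). The line below is a made-up example for a single object annotated with four points:

0 0.4821 0.3310 0.5103 0.3362 0.5244 0.4105 0.4903 0.4079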

After the conversion, run txt_split_yolo11.py to split the data into training, validation, and test sets.

# txt_split_yolo11.py
# split the images and labels into training/validation/test sets by the given ratios
import shutil
import random
import os
 
# source paths
image_original_path = "./seg0613/"
label_original_path = "./seg_txt/"
 
cur_path = os.getcwd() # current working directory
#cur_path = './chatou_seg'
# training set paths
train_image_path = os.path.join(cur_path, "data/images/train/")
train_label_path = os.path.join(cur_path, "data/labels/train/")
 
# validation set paths
val_image_path = os.path.join(cur_path, "data/images/val/")
val_label_path = os.path.join(cur_path, "data/labels/val/")
 
# test set paths
test_image_path = os.path.join(cur_path, "data/images/test/")
test_label_path = os.path.join(cur_path, "data/labels/test/")
 
# txt lists of the image paths for each split
list_train = os.path.join(cur_path, "data/train.txt")
list_val = os.path.join(cur_path, "data/val.txt")
list_test = os.path.join(cur_path, "data/test.txt")

train_percent = 0.9
val_percent = 0.1
test_percent = 0.0
 
def del_file(path):
    for i in os.listdir(path):
        file_data = os.path.join(path, i)
        os.remove(file_data)
  
def mkdir():
    if not os.path.exists(train_image_path):
        os.makedirs(train_image_path)
    else:
        del_file(train_image_path)
    if not os.path.exists(train_label_path):
        os.makedirs(train_label_path)
    else:
        del_file(train_label_path)
 
    if not os.path.exists(val_image_path):
        os.makedirs(val_image_path)
    else:
        del_file(val_image_path)
    if not os.path.exists(val_label_path):
        os.makedirs(val_label_path)
    else:
        del_file(val_label_path)
 
    if not os.path.exists(test_image_path):
        os.makedirs(test_image_path)
    else:
        del_file(test_image_path)
    if not os.path.exists(test_label_path):
        os.makedirs(test_label_path)
    else:
        del_file(test_label_path)

def clearfile():
    if os.path.exists(list_train):
        os.remove(list_train)
    if os.path.exists(list_val):
        os.remove(list_val)
    if os.path.exists(list_test):
        os.remove(list_test) 
def main():
    mkdir()
    clearfile()
 
    file_train = open(list_train, 'w')
    file_val = open(list_val, 'w')
    file_test = open(list_test, 'w')
 
    total_txt = os.listdir(label_original_path)
    num_txt = len(total_txt)
    list_all_txt = range(num_txt)
 
    num_train = int(num_txt * train_percent)
    num_val = int(num_txt * val_percent)
    num_test = num_txt - num_train - num_val
 
    train = random.sample(list_all_txt, num_train)
    # sample num_train indices from list_all_txt for the training set;
    # the indices that were not chosen form the val/test pool below
    val_test = [i for i in list_all_txt if not i in train]
    # sample num_val indices from val_test; whatever remains becomes the test set
    val = random.sample(val_test, num_val)
 
    print("train: {}, val: {}, test: {}".format(len(train), len(val), len(val_test) - len(val)))
    for i in list_all_txt:
        name = total_txt[i][:-4]
 
        srcImage = image_original_path + name + '.bmp'
        srcLabel = label_original_path + name + ".txt"
 
        if i in train:
            dst_train_Image = train_image_path + name + '.bmp'
            dst_train_Label = train_label_path + name + '.txt'
            shutil.copyfile(srcImage, dst_train_Image)
            shutil.copyfile(srcLabel, dst_train_Label)
            file_train.write(dst_train_Image + '\n')
        elif i in val:
            dst_val_Image = val_image_path + name + '.bmp'
            dst_val_Label = val_label_path + name + '.txt'
            shutil.copyfile(srcImage, dst_val_Image)
            shutil.copyfile(srcLabel, dst_val_Label)
            file_val.write(dst_val_Image + '\n')
        else:
            dst_test_Image = test_image_path + name + '.bmp'
            dst_test_Label = test_label_path + name + '.txt'
            shutil.copyfile(srcImage, dst_test_Image)
            shutil.copyfile(srcLabel, dst_test_Label)
            file_test.write(dst_test_Image + '\n')
 
    file_train.close()
    file_val.close()
    file_test.close()
if __name__ == "__main__":
    main()
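
Before training, you also need a dataset.yaml describing the dataset layout and class names (it is referenced by the training command below). A minimal sketch, where the paths and class names are placeholders for your own dataset:

# dataset.yaml (example; adjust the paths and class names to your dataset)
path: /home/xxx/yolov11/data   # dataset root
train: images/train
val: images/val
test: images/test              # optional
names:
  0: "0"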

Training, prediction and ONNX export commands for segmentation (a Python-API equivalent follows the commands)

# train
yolo segment train data=dataset.yaml model=yolo11n-seg.pt epochs=300 imgsz=1920 amp=False batch=2 lr0=0.001 mosaic=0.05 patience=200 
# predict
yolo segment predict model=runs/segment/train4/weights/best.pt source=/xxx/images/test save=True conf=0.4 iou=0.5
# export
yolo export model=/xxx/yolov11/runs/segment/train4/weights/best.pt format=onnx opset=17 simplify=True
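
If you prefer the Python API over the CLI, the calls below are roughly equivalent (the paths are placeholders and the ultralytics package is assumed to be installed):

from ultralytics import YOLO

# train
model = YOLO("yolo11n-seg.pt")
model.train(data="dataset.yaml", epochs=300, imgsz=1920, batch=2, lr0=0.001, mosaic=0.05, patience=200)

# predict
model = YOLO("runs/segment/train4/weights/best.pt")
model.predict(source="images/test", save=True, conf=0.4, iou=0.5)

# export to onnx
model.export(format="onnx", opset=17, simplify=True)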

2. Model Deployment

C++ version

This part mainly follows the open-source repo https://github.com/UNeedCryDear/yolov8-opencv-onnxruntime-cpp
It is very similar to detection. The files yolov8_utils.h and yolov8_utils.cpp are not printed here; they are the same as in object detection, see the repo above or the article: A detailed end-to-end guide to YOLOv8/11-detect object detection: environment setup, data annotation, training, validation/prediction, and ONNX deployment (C++/Python).

Five files are involved: main.cpp, yolov8_utils.h, yolov8_seg_onnx.h, yolov8_utils.cpp and yolov8_seg_onnx.cpp. Since yolov8_utils.h and yolov8_utils.cpp are identical to the YOLOv8/11-detect version, their code is not repeated here; a build sketch is given right below.
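
To compile the three source files, a minimal build command sketch is given below. It assumes OpenCV is installed system-wide (found via pkg-config) and ONNX Runtime has been unpacked to /opt/onnxruntime; both locations are assumptions, adjust them to your setup:

g++ -std=c++17 main.cpp yolov8_utils.cpp yolov8_seg_onnx.cpp -o yolov8_seg \
    $(pkg-config --cflags --libs opencv4) \
    -I/opt/onnxruntime/include -L/opt/onnxruntime/lib -lonnxruntime

At runtime, make sure libonnxruntime.so can be found, e.g. by adding /opt/onnxruntime/lib to LD_LIBRARY_PATH.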

yolov8_seg_onnx.h — the segmentation header file

#pragma once
#include <iostream>
#include <memory>
#include <numeric>
#include <opencv2/opencv.hpp>
#include "yolov8_utils.h"
#include <onnxruntime_cxx_api.h>
//#include <tensorrt_provider_factory.h>  //if use OrtTensorRTProviderOptionsV2
//#include <onnxruntime_c_api.h>

class Yolov8SegOnnx {
public:
	Yolov8SegOnnx() :_OrtMemoryInfo(Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtDeviceAllocator, OrtMemType::OrtMemTypeCPUOutput)) {};
	~Yolov8SegOnnx() {
		if (_OrtSession != nullptr)
			delete _OrtSession;
	};// delete _OrtMemoryInfo;


public:
	/** \brief Read onnx-model
	* \param[in] modelPath:onnx-model path
	* \param[in] isCuda:if true,use Ort-GPU,else run it on cpu.
	* \param[in] cudaID:if isCuda==true,run Ort-GPU on cudaID.
	* \param[in] warmUp:if isCuda==true,warm up GPU-model.
	*/
	bool ReadModel(const std::string& modelPath, bool isCuda = false, int cudaID = 0, bool warmUp = true);

	/** \brief  detect.
	* \param[in] srcImg:a 3-channels image.
	* \param[out] output:detection results of input image.
	*/
	bool OnnxDetect(cv::Mat& srcImg, std::vector<OutputParams>& output);
	/** \brief  detect,batch size= _batchSize
	* \param[in] srcImg:A batch of images.
	* \param[out] output:detection results of input images.
	*/
	bool OnnxBatchDetect(std::vector<cv::Mat>& srcImg, std::vector<std::vector<OutputParams>>& output);

private:

	template <typename T>
	T VectorProduct(const std::vector<T>& v)
	{
		return std::accumulate(v.begin(), v.end(), 1, std::multiplies<T>());
	};
	int PreProcessing(const std::vector<cv::Mat>& srcImgs, std::vector<cv::Mat>& outSrcImgs, std::vector<cv::Vec4d>& params);

	const int _netWidth = 1920;   //ONNX-net-input-width
	const int _netHeight = 1920;  //ONNX-net-input-height

	int _batchSize = 1;  //if multi-batch,set this
	bool _isDynamicShape = false;//onnx support dynamic shape
	float _classThreshold = 0.25;
	float _nmsThreshold = 0.45;
	float _maskThreshold = 0.4;


	//ONNXRUNTIME	
	Ort::Env _OrtEnv = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR, "Yolov8");
	Ort::SessionOptions _OrtSessionOptions = Ort::SessionOptions();
	Ort::Session* _OrtSession = nullptr;
	Ort::MemoryInfo _OrtMemoryInfo;
#if ORT_API_VERSION < ORT_OLD_VISON

	char* _inputName, * _output_name0, * _output_name1;
#else
	std::shared_ptr<char> _inputName, _output_name0,_output_name1;
#endif

	std::vector<char*> _inputNodeNames;   // input node names
	std::vector<char*> _outputNodeNames;  // output node names

	size_t _inputNodesNum = 0;        // number of input nodes
	size_t _outputNodesNum = 0;       // number of output nodes

	ONNXTensorElementDataType _inputNodeDataType;  // input data type
	ONNXTensorElementDataType _outputNodeDataType;
	std::vector<int64_t> _inputTensorShape;        // input tensor shape

	std::vector<int64_t> _outputTensorShape;
	std::vector<int64_t> _outputMaskTensorShape;
public:
	std::vector<std::string> _className = {
		"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
		"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
		"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
		"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
		"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
		"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
		"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
		"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
		"hair drier", "toothbrush"
	};
};

yolov8_seg_onnx.cpp

//yolov8_seg_onnx.cpp

#include "yolov8_seg_onnx.h"
//using namespace std;
//using namespace cv;
//using namespace cv::dnn;
using namespace Ort;

bool Yolov8SegOnnx::ReadModel(const std::string& modelPath, bool isCuda, int cudaID, bool warmUp) {
	if (_batchSize < 1) _batchSize = 1;
	try
	{
		if (!CheckModelPath(modelPath))
			return false;
		std::vector<std::string> available_providers = GetAvailableProviders();
		auto cuda_available = std::find(available_providers.begin(), available_providers.end(), "CUDAExecutionProvider");

		if (isCuda && (cuda_available == available_providers.end()))
		{
			std::cout << "Your ORT build without GPU. Change to CPU." << std::endl;
			std::cout << "************* Infer model on CPU! *************" << std::endl;
		}
		else if (isCuda && (cuda_available != available_providers.end()))
		{
			std::cout << "************* Infer model on GPU! *************" << std::endl;
#if ORT_API_VERSION < ORT_OLD_VISON
			OrtCUDAProviderOptions cudaOption;
			cudaOption.device_id = cudaID;
			_OrtSessionOptions.AppendExecutionProvider_CUDA(cudaOption);
#else
			OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_CUDA(_OrtSessionOptions, cudaID);
#endif
		}
		else
		{
			std::cout << "************* Infer model on CPU! *************" << std::endl;
		}
		//
		_OrtSessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

#ifdef _WIN32
		std::wstring model_path(modelPath.begin(), modelPath.end());
		_OrtSession = new Ort::Session(_OrtEnv, model_path.c_str(), _OrtSessionOptions);
#else
		_OrtSession = new Ort::Session(_OrtEnv, modelPath.c_str(), _OrtSessionOptions);
#endif

		Ort::AllocatorWithDefaultOptions allocator;
		//init input
		_inputNodesNum = _OrtSession->GetInputCount();
#if ORT_API_VERSION < ORT_OLD_VISON
		_inputName = _OrtSession->GetInputName(0, allocator);
		_inputNodeNames.push_back(_inputName);
#else
		_inputName = std::move(_OrtSession->GetInputNameAllocated(0, allocator));
		_inputNodeNames.push_back(_inputName.get());
#endif

		Ort::TypeInfo inputTypeInfo = _OrtSession->GetInputTypeInfo(0);
		auto input_tensor_info = inputTypeInfo.GetTensorTypeAndShapeInfo();
		_inputNodeDataType = input_tensor_info.GetElementType();
		_inputTensorShape = input_tensor_info.GetShape();

		if (_inputTensorShape[0] == -1)
		{
			_isDynamicShape = true;
			_inputTensorShape[0] = _batchSize;

		}
		if (_inputTensorShape[2] == -1 || _inputTensorShape[3] == -1) {
			_isDynamicShape = true;
			_inputTensorShape[2] = _netHeight;
			_inputTensorShape[3] = _netWidth;
		}
		//init output
		_outputNodesNum = _OrtSession->GetOutputCount();
		if (_outputNodesNum != 2) {
			std::cout << "This model has " << _outputNodesNum << " outputs, which is not a segmentation model. Please check your model name or path!" << std::endl;
			return false;
		}
#if ORT_API_VERSION < ORT_OLD_VISON
		_output_name0 = _OrtSession->GetOutputName(0, allocator);
		_output_name1 = _OrtSession->GetOutputName(1, allocator);
#else
		_output_name0 = std::move(_OrtSession->GetOutputNameAllocated(0, allocator));
		_output_name1 = std::move(_OrtSession->GetOutputNameAllocated(1, allocator));
#endif
		Ort::TypeInfo type_info_output0(nullptr);
		Ort::TypeInfo type_info_output1(nullptr);
		bool flag = false;
#if ORT_API_VERSION < ORT_OLD_VISON
		flag = strcmp(_output_name0, _output_name1) < 0;
#else
		flag = strcmp(_output_name0.get(), _output_name1.get()) < 0;
#endif
		if (flag)  //make sure "output0" is in front of  "output1"
		{
			type_info_output0 = _OrtSession->GetOutputTypeInfo(0);  //output0
			type_info_output1 = _OrtSession->GetOutputTypeInfo(1);  //output1
#if ORT_API_VERSION < ORT_OLD_VISON
			_outputNodeNames.push_back(_output_name0);
			_outputNodeNames.push_back(_output_name1);
#else
			_outputNodeNames.push_back(_output_name0.get());
			_outputNodeNames.push_back(_output_name1.get());
#endif

		}
		else {
			type_info_output0 = _OrtSession->GetOutputTypeInfo(1);  //output0
			type_info_output1 = _OrtSession->GetOutputTypeInfo(0);  //output1
#if ORT_API_VERSION < ORT_OLD_VISON
			_outputNodeNames.push_back(_output_name1);
			_outputNodeNames.push_back(_output_name0);
#else
			_outputNodeNames.push_back(_output_name1.get());
			_outputNodeNames.push_back(_output_name0.get());
#endif
		}

		auto tensor_info_output0 = type_info_output0.GetTensorTypeAndShapeInfo();
		_outputNodeDataType = tensor_info_output0.GetElementType();
		_outputTensorShape = tensor_info_output0.GetShape();
		auto tensor_info_output1 = type_info_output1.GetTensorTypeAndShapeInfo();
		//_outputMaskNodeDataType = tensor_info_output1.GetElementType(); //the same as output0
		//_outputMaskTensorShape = tensor_info_output1.GetShape();
		//if (_outputTensorShape[0] == -1)
		//{
		//	_outputTensorShape[0] = _batchSize;
		//	_outputMaskTensorShape[0] = _batchSize;
		//}
		//if (_outputMaskTensorShape[2] == -1) {
		//	//size_t ouput_rows = 0;
		//	//for (int i = 0; i < _strideSize; ++i) {
		//	//	ouput_rows += 3 * (_netWidth / _netStride[i]) * _netHeight / _netStride[i];
		//	//}
		//	//_outputTensorShape[1] = ouput_rows;

		//	_outputMaskTensorShape[2] = _segHeight;
		//	_outputMaskTensorShape[3] = _segWidth;
		//}
		//warm up
		if (isCuda && warmUp) {
			//draw run
			std::cout << "Start warming up" << std::endl;
			size_t input_tensor_length = VectorProduct(_inputTensorShape);
			float* temp = new float[input_tensor_length];
			std::vector<Ort::Value> input_tensors;
			std::vector<Ort::Value> output_tensors;
			input_tensors.push_back(Ort::Value::CreateTensor<float>(
				_OrtMemoryInfo, temp, input_tensor_length, _inputTensorShape.data(),
				_inputTensorShape.size()));
			for (int i = 0; i < 3; ++i) {
				output_tensors = _OrtSession->Run(Ort::RunOptions{ nullptr },
					_inputNodeNames.data(),
					input_tensors.data(),
					_inputNodeNames.size(),
					_outputNodeNames.data(),
					_outputNodeNames.size());
			}

			delete[]temp;
		}
	}
	catch (const std::exception&) {
		return false;
	}
	return true;
}

int Yolov8SegOnnx::PreProcessing(const std::vector<cv::Mat>& srcImgs, std::vector<cv::Mat>& outSrcImgs, std::vector<cv::Vec4d>& params) {
	outSrcImgs.clear();
	cv::Size input_size = cv::Size(_netWidth, _netHeight);
	for (int i = 0; i < srcImgs.size(); ++i) {
		cv::Mat temp_img = srcImgs[i];
		cv::Vec4d temp_param = { 1,1,0,0 };
		if (temp_img.size() != input_size) {
			cv::Mat borderImg;
			LetterBox(temp_img, borderImg, temp_param, input_size, false, false, true, 32);
			//std::cout << borderImg.size() << std::endl;
			outSrcImgs.push_back(borderImg);
			params.push_back(temp_param);
		}
		else {
			outSrcImgs.push_back(temp_img);
			params.push_back(temp_param);
		}
	}

	int lack_num = srcImgs.size() % _batchSize;
	if (lack_num != 0) {
		for (int i = 0; i < lack_num; ++i) {
			cv::Mat temp_img = cv::Mat::zeros(input_size, CV_8UC3);
			cv::Vec4d temp_param = { 1,1,0,0 };
			outSrcImgs.push_back(temp_img);
			params.push_back(temp_param);
		}
	}
	return 0;

}
bool Yolov8SegOnnx::OnnxDetect(cv::Mat& srcImg, std::vector<OutputParams>& output) {
	std::vector<cv::Mat> input_data = { srcImg };
	std::vector<std::vector<OutputParams>> tenp_output;
	if (OnnxBatchDetect(input_data, tenp_output)) {
		output = tenp_output[0];
		return true;
	}
	else return false;
}
bool Yolov8SegOnnx::OnnxBatchDetect(std::vector<cv::Mat>& srcImgs, std::vector<std::vector<OutputParams>>& output) {
	std::vector<cv::Vec4d> params;
	std::vector<cv::Mat> input_images;
	cv::Size input_size(_netWidth, _netHeight);
	//preprocessing
	PreProcessing(srcImgs, input_images, params);
	cv::Mat blob = cv::dnn::blobFromImages(input_images, 1 / 255.0, input_size, cv::Scalar(0, 0, 0), true, false);

	int64_t input_tensor_length = VectorProduct(_inputTensorShape);
	std::vector<Ort::Value> input_tensors;
	std::vector<Ort::Value> output_tensors;
	input_tensors.push_back(Ort::Value::CreateTensor<float>(_OrtMemoryInfo, (float*)blob.data, input_tensor_length, _inputTensorShape.data(), _inputTensorShape.size()));

	output_tensors = _OrtSession->Run(Ort::RunOptions{ nullptr },
		_inputNodeNames.data(),
		input_tensors.data(),
		_inputNodeNames.size(),
		_outputNodeNames.data(),
		_outputNodeNames.size()
	);

	//post-process
	float* all_data = output_tensors[0].GetTensorMutableData<float>();
	_outputTensorShape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
	_outputMaskTensorShape = output_tensors[1].GetTensorTypeAndShapeInfo().GetShape();
	std::vector<int> mask_protos_shape = { 1,(int)_outputMaskTensorShape[1],(int)_outputMaskTensorShape[2],(int)_outputMaskTensorShape[3] };
	int mask_protos_length = VectorProduct(mask_protos_shape);
	int64_t one_output_length = VectorProduct(_outputTensorShape) / _outputTensorShape[0];
	int net_width = (int)_outputTensorShape[1];
	int socre_array_length = net_width - 4 - _outputMaskTensorShape[1];
	for (int img_index = 0; img_index < srcImgs.size(); ++img_index) {
		cv::Mat output0 = cv::Mat(cv::Size((int)_outputTensorShape[2], (int)_outputTensorShape[1]), CV_32F, all_data).t();  //[bs,116,8400]=>[bs,8400,116]
		all_data += one_output_length;
		float* pdata = (float*)output0.data;
		int rows = output0.rows;
		std::vector<int> class_ids;      // class id of each candidate
		std::vector<float> confidences;  // confidence of each candidate
		std::vector<cv::Rect> boxes;     // bounding box of each candidate
		std::vector<std::vector<float>> picked_proposals;  //output0[:,:, 5 + _className.size():net_width]===> for mask
		for (int r = 0; r < rows; ++r) {    //stride
			cv::Mat scores(1, socre_array_length, CV_32F, pdata + 4);
			cv::Point classIdPoint;
			double max_class_socre;
			minMaxLoc(scores, 0, &max_class_socre, 0, &classIdPoint);
			max_class_socre = (float)max_class_socre;
			if (max_class_socre >= _classThreshold) {
				std::vector<float> temp_proto(pdata + 4 + socre_array_length, pdata + net_width);
				picked_proposals.push_back(temp_proto);
				//rect [x,y,w,h]
				float x = (pdata[0] - params[img_index][2]) / params[img_index][0];  //x
				float y = (pdata[1] - params[img_index][3]) / params[img_index][1];  //y
				float w = pdata[2] / params[img_index][0];  //w
				float h = pdata[3] / params[img_index][1];  //h
				int left = MAX(int(x - 0.5 * w + 0.5), 0);
				int top = MAX(int(y - 0.5 * h + 0.5), 0);
				class_ids.push_back(classIdPoint.x);
				confidences.push_back(max_class_socre);
				boxes.push_back(cv::Rect(left, top, int(w + 0.5), int(h + 0.5)));
			}
			pdata += net_width;  // move to the next row
		}

		std::vector<int> nms_result;
		cv::dnn::NMSBoxes(boxes, confidences, _classThreshold, _nmsThreshold, nms_result);
		std::vector<std::vector<float>> temp_mask_proposals;
		cv::Rect holeImgRect(0, 0, srcImgs[img_index].cols, srcImgs[img_index].rows);
		std::vector<OutputParams> temp_output;
		for (int i = 0; i < nms_result.size(); ++i) {
			int idx = nms_result[i];
			OutputParams result;
			result.id = class_ids[idx];
			result.confidence = confidences[idx];
			result.box = boxes[idx] & holeImgRect;
			temp_mask_proposals.push_back(picked_proposals[idx]);
			temp_output.push_back(result);
		}

		MaskParams mask_params;
		mask_params.params = params[img_index];
		mask_params.srcImgShape = srcImgs[img_index].size();
		mask_params.netHeight = _netHeight;
		mask_params.netWidth = _netWidth;
		mask_params.maskThreshold = _maskThreshold;
		cv::Mat mask_protos = cv::Mat(mask_protos_shape, CV_32F, output_tensors[1].GetTensorMutableData<float>() + img_index * mask_protos_length);
		for (int i = 0; i < temp_mask_proposals.size(); ++i) {
			GetMask2(cv::Mat(temp_mask_proposals[i]).t(), mask_protos, temp_output[i], mask_params);
		}

		//************************************************************
		// Older post-processing path: if GetMask2() still reports errors
		// in some cases, it is recommended to use GetMask() below instead.
		//cv::Mat mask_proposals;
		//for (int i = 0; i < temp_mask_proposals.size(); ++i) {
		//	mask_proposals.push_back(cv::Mat(temp_mask_proposals[i]).t());
		//}
		//GetMask(mask_proposals, mask_protos, temp_output, mask_params);
		//*****************************************************/
		output.push_back(temp_output);

	}

	if (output.size())
		return true;
	else
		return false;
}

main.cpp — compared with the detection demo, code has been added inside the yolov8_onnx function to merge the predicted instance masks into a single combined mask.

#include <iostream>
#include <string>
#include <chrono>
#include <ctime>
#include <opencv2/opencv.hpp>
#include "yolov8_seg_onnx.h"
//#define  VIDEO_OPENCV //if define, use opencv for video.

using namespace std;
using namespace cv;
using namespace dnn;

template<typename _Tp>
std::vector<OutputParams> yolov8_onnx(_Tp& task, cv::Mat& img, std::string& model_path)
{

	// if (task.ReadModel(model_path, false,0,true)) {
	// 	std::cout << "read net ok!" << std::endl;
	// }
	// generate a random color for each of the 80 classes
	std::vector<cv::Scalar> color;
	srand(time(0));
	for (int i = 0; i < 80; i++) {
		int b = rand() % 256;
		int g = rand() % 256;
		int r = rand() % 256;
		color.push_back(cv::Scalar(b, g, r));
	}
	std::vector<OutputParams> result;
	
	if (task.OnnxDetect(img, result)) {
		//std::cout << "111" << std::endl;
		DrawPred(img, result, task._className, color,false);
		// iterate over all detections and merge their masks
		cv::Mat combinedMask = cv::Mat::zeros(img.size(), CV_8UC1);
		for (const auto& output : result) {
			// ROI of the current detection box inside the combined mask
			cv::Mat roi = combinedMask(output.box);
			cv::Mat boxMaskBinary;
			output.boxMask.convertTo(boxMaskBinary, CV_8UC1);
			// merge the current mask into the combined mask;
			// OR is used here, change the operation if another merge rule is needed
			cv::bitwise_or(roi, boxMaskBinary, roi);
		}
		cv::imwrite("combinedMask.png", combinedMask);

	}
	else {
		std::cout << "Detect Failed!" << std::endl;
	}
	//system("pause");
	return result;
}
int main() {
	std::string img_path = "./images/_20250609_144103.bmp";
	//std::string img_path = "../rgb/2025-05-27_08-37-46_undistort_bright.bmp";
	std::string model_path_detect = "./model/0613.onnx";
	cv::Mat src = imread(img_path);
	cv::Mat img = src.clone();
	//Yolov8Onnx			task_detect_ort;
	Yolov8SegOnnx		task_segment_ort;
	if (task_segment_ort.ReadModel(model_path_detect, false,0,true)) {
		std::cout << "read net ok!" << std::endl;
	}
	std::vector<OutputParams> results_detect;
	long long startTime = std::chrono::system_clock::now().time_since_epoch().count(); //ns
	results_detect=yolov8_onnx(task_segment_ort, img, model_path_detect); //yolov8 onnxruntime 
	long long timeNow = std::chrono::system_clock::now().time_since_epoch().count();
	double timeuse = (timeNow - startTime) * 0.000001;
	//std::cout << "end detect" << std::endl;
	std::cout << (timeNow - startTime) * 0.000001 << "ms\n";
	std::cout<<"num: "<<results_detect.size()<<endl;
	OutputParams out_result;
	// for (int i = 0; i < results_detect.size(); i++) {
	// 	cout << results_detect[i].box << endl;
	// }
	cv::waitKey(0);
	return 0;
}

Python version

The Python deployment is relatively simple: change the model and image paths below and run the script directly. A note on merging all instance masks into one image follows the script.

import cv2
import numpy as np
import onnxruntime as ort
import time

classes = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck',
           8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
           14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
           22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase',
           29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
           35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
           40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple',
           48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut',
           55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
           62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave',
           69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase',
           76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'}

class Colors:
    """
    This class provides methods to work with the Ultralytics color palette, including converting hex color codes to
    RGB values.

    Attributes:
        palette (list of tuple): List of RGB color values.
        n (int): The number of colors in the palette.
        pose_palette (np.array): A specific color palette array with dtype np.uint8.
    """

    def __init__(self):
        """Initialize colors as hex = matplotlib.colors.TABLEAU_COLORS.values()."""
        hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86', '1A9334', '00D4BB',
                '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7')
        self.palette = [self.hex2rgb(f'#{c}') for c in hexs]
        self.n = len(self.palette)
        self.pose_palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102], [230, 230, 0], [255, 153, 255],
                                      [153, 204, 255], [255, 102, 255], [255, 51, 255], [102, 178, 255], [51, 153, 255],
                                      [255, 153, 153], [255, 102, 102], [255, 51, 51], [153, 255, 153], [102, 255, 102],
                                      [51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0], [255, 255, 255]],
                                     dtype=np.uint8)

    def __call__(self, i, bgr=False):
        """Converts hex color codes to RGB values."""
        c = self.palette[int(i) % self.n]
        return (c[2], c[1], c[0]) if bgr else c

    @staticmethod
    def hex2rgb(h):
        """Converts hex color codes to RGB values (i.e. default PIL order)."""
        return tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4))


class YOLOv8Seg:
    """YOLOv8 segmentation model."""

    def __init__(self, onnx_model):
        """
        Initialization.

        Args:
            onnx_model (str): Path to the ONNX model.
        """

        # Build Ort session
        self.session = ort.InferenceSession(onnx_model,
                                            providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
                                            if ort.get_device() == 'GPU' else ['CPUExecutionProvider'])

        # Numpy dtype: support both FP32 and FP16 onnx model
        self.ndtype = np.half if self.session.get_inputs()[0].type == 'tensor(float16)' else np.single

        # Get model width and height(YOLOv8-seg only has one input)
        self.model_height, self.model_width = [x.shape for x in self.session.get_inputs()][0][-2:]

        # Load COCO class names
        self.classes = classes

        # Create color palette
        self.color_palette = Colors()

    def __call__(self, im0, conf_threshold=0.4, iou_threshold=0.45, nm=32):
        """
        The whole pipeline: pre-process -> inference -> post-process.

        Args:
            im0 (Numpy.ndarray): original input image.
            conf_threshold (float): confidence threshold for filtering predictions.
            iou_threshold (float): iou threshold for NMS.
            nm (int): the number of masks.

        Returns:
            boxes (List): list of bounding boxes.
            segments (List): list of segments.
            masks (np.ndarray): [N, H, W], output masks.
        """

        # Pre-process
        im, ratio, (pad_w, pad_h) = self.preprocess(im0)

        # Ort inference
        preds = self.session.run(None, {self.session.get_inputs()[0].name: im})

        # Post-process
        boxes, segments, masks = self.postprocess(preds,
                                                  im0=im0,
                                                  ratio=ratio,
                                                  pad_w=pad_w,
                                                  pad_h=pad_h,
                                                  conf_threshold=conf_threshold,
                                                  iou_threshold=iou_threshold,
                                                  nm=nm)
        return boxes, segments, masks

    def preprocess(self, img):
        """
        Pre-processes the input image.

        Args:
            img (Numpy.ndarray): image about to be processed.

        Returns:
            img_process (Numpy.ndarray): image preprocessed for inference.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
        """

        # Resize and pad input image using letterbox() (Borrowed from Ultralytics)
        shape = img.shape[:2]  # original image shape
        new_shape = (self.model_height, self.model_width)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        ratio = r, r
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        pad_w, pad_h = (new_shape[1] - new_unpad[0]) / 2, (new_shape[0] - new_unpad[1]) / 2  # wh padding
        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
        top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
        left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))

        # Transforms: HWC to CHW -> BGR to RGB -> div(255) -> contiguous -> add axis(optional)
        img = np.ascontiguousarray(np.einsum('HWC->CHW', img)[::-1], dtype=self.ndtype) / 255.0
        img_process = img[None] if len(img.shape) == 3 else img
        return img_process, ratio, (pad_w, pad_h)

    def postprocess(self, preds, im0, ratio, pad_w, pad_h, conf_threshold, iou_threshold, nm=32):
        """
        Post-process the prediction.

        Args:
            preds (Numpy.ndarray): predictions come from ort.session.run().
            im0 (Numpy.ndarray): [h, w, c] original input image.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
            conf_threshold (float): conf threshold.
            iou_threshold (float): iou threshold.
            nm (int): the number of masks.

        Returns:
            boxes (List): list of bounding boxes.
            segments (List): list of segments.
            masks (np.ndarray): [N, H, W], output masks.
        """
        x, protos = preds[0], preds[1]  # Two outputs: predictions and protos

        # Transpose the first output: (Batch_size, xywh_conf_cls_nm, Num_anchors) -> (Batch_size, Num_anchors, xywh_conf_cls_nm)
        x = np.einsum('bcn->bnc', x)

        # Predictions filtering by conf-threshold
        x = x[np.amax(x[..., 4:-nm], axis=-1) > conf_threshold]

        # Create a new matrix which merge these(box, score, cls, nm) into one
        # For more details about `numpy.c_()`: https://numpy.org/doc/1.26/reference/generated/numpy.c_.html
        x = np.c_[x[..., :4], np.amax(x[..., 4:-nm], axis=-1), np.argmax(x[..., 4:-nm], axis=-1), x[..., -nm:]]
        # NMS filtering
        x = x[cv2.dnn.NMSBoxes(x[:, :4], x[:, 4], conf_threshold, iou_threshold)]

        # Decode and return
        if len(x) > 0:

            # Bounding boxes format change: cxcywh -> xyxy
            x[..., [0, 1]] -= x[..., [2, 3]] / 2
            x[..., [2, 3]] += x[..., [0, 1]]

            # Rescales bounding boxes from model shape(model_height, model_width) to the shape of original image
            x[..., :4] -= [pad_w, pad_h, pad_w, pad_h]
            x[..., :4] /= min(ratio)

            # Bounding boxes boundary clamp
            x[..., [0, 2]] = x[:, [0, 2]].clip(0, im0.shape[1])
            x[..., [1, 3]] = x[:, [1, 3]].clip(0, im0.shape[0])

            # Process masks
            masks = self.process_mask(protos[0], x[:, 6:], x[:, :4], im0.shape)

            # Masks -> Segments(contours)
            segments = self.masks2segments(masks)
            return x[..., :6], segments, masks  # boxes, segments, masks
        else:
            return [], [], []

    @staticmethod
    def masks2segments(masks):
        """
        It takes a list of masks(n,h,w) and returns a list of segments(n,xy)
        Args:
            masks (numpy.ndarray): the output of the model, which is a tensor of shape (batch_size, 160, 160).

        Returns:
            segments (List): list of segment masks.
        """
        segments = []
        for x in masks.astype('uint8'):
            c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[0]  # CHAIN_APPROX_SIMPLE
            if c:
                c = np.array(c[np.array([len(x) for x in c]).argmax()]).reshape(-1, 2)
            else:
                c = np.zeros((0, 2))  # no segments found
            segments.append(c.astype('float32'))
        return segments

    @staticmethod
    def crop_mask(masks, boxes):
        """
        It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.
        Args:
            masks (Numpy.ndarray): [n, h, w] tensor of masks.
            boxes (Numpy.ndarray): [n, 4] tensor of bbox coordinates in relative point form.

        Returns:
            (Numpy.ndarray): The masks are being cropped to the bounding box.
        """
        n, h, w = masks.shape
        x1, y1, x2, y2 = np.split(boxes[:, :, None], 4, 1)
        r = np.arange(w, dtype=x1.dtype)[None, None, :]
        c = np.arange(h, dtype=x1.dtype)[None, :, None]
        return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))

    def process_mask(self, protos, masks_in, bboxes, im0_shape):
        """
        Takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher quality
        but is slower.
        Args:
            protos (numpy.ndarray): [mask_dim, mask_h, mask_w].
            masks_in (numpy.ndarray): [n, mask_dim], n is number of masks after nms.
            bboxes (numpy.ndarray): bboxes re-scaled to original image shape.
            im0_shape (tuple): the size of the input image (h,w,c).

        Returns:
            (numpy.ndarray): The upsampled masks.
        """
        c, mh, mw = protos.shape
        masks = np.matmul(masks_in, protos.reshape((c, -1))).reshape((-1, mh, mw)).transpose(1, 2, 0)  # HWN
        masks = np.ascontiguousarray(masks)
        masks = self.scale_mask(masks, im0_shape)  # re-scale mask from P3 shape to original input image shape
        masks = np.einsum('HWN -> NHW', masks)  # HWN -> NHW
        masks = self.crop_mask(masks, bboxes)
        return np.greater(masks, 0.5)

    @staticmethod
    def scale_mask(masks, im0_shape, ratio_pad=None):
        """
        Takes a mask, and resizes it to the original image size.
        Args:
            masks (np.ndarray): resized and padded masks/images, [h, w, num]/[h, w, 3].
            im0_shape (tuple): the original image shape.
            ratio_pad (tuple): the ratio of the padding to the original image.

        Returns:
            masks (np.ndarray): The masks that are being returned.
        """
        im1_shape = masks.shape[:2]
        if ratio_pad is None:  # calculate from im0_shape
            gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain  = old / new
            pad = (im1_shape[1] - im0_shape[1] * gain) / 2, (im1_shape[0] - im0_shape[0] * gain) / 2  # wh padding
        else:
            pad = ratio_pad[1]

        # Calculate tlbr of mask
        top, left = int(round(pad[1] - 0.1)), int(round(pad[0] - 0.1))  # y, x
        bottom, right = int(round(im1_shape[0] - pad[1] + 0.1)), int(round(im1_shape[1] - pad[0] + 0.1))
        if len(masks.shape) < 2:
            raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
        masks = masks[top:bottom, left:right]
        masks = cv2.resize(masks, (im0_shape[1], im0_shape[0]),
                           interpolation=cv2.INTER_LINEAR)  # INTER_CUBIC would be better
        if len(masks.shape) == 2:
            masks = masks[:, :, None]
        return masks

    def draw_and_visualize(self, im, bboxes, segments, vis=True, save=False):
        """
        Draw and visualize results.

        Args:
            im (np.ndarray): original image, shape [h, w, c].
            bboxes (numpy.ndarray): [n, 4], n is number of bboxes.
            segments (List): list of segment masks.
            vis (bool): imshow using OpenCV.
            save (bool): save image annotated.

        Returns:
            None
        """

        # Draw rectangles and polygons
        im_canvas = im.copy()
        for (*box, conf, cls_), segment in zip(bboxes, segments):
            # draw contour and fill mask
            cv2.polylines(im, np.int32([segment]), True, (255, 255, 255), 2)  # white borderline
            cv2.fillPoly(im_canvas, np.int32([segment]), self.color_palette(int(cls_), bgr=True))

            # draw bbox rectangle
            cv2.rectangle(im, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])),
                          self.color_palette(int(cls_), bgr=True), 1, cv2.LINE_AA)
            cv2.putText(im, f'{self.classes[cls_]}: {conf:.3f}', (int(box[0]), int(box[1] - 9)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, self.color_palette(int(cls_), bgr=True), 2, cv2.LINE_AA)

        # Mix image
        im = cv2.addWeighted(im_canvas, 0.3, im, 0.7, 0)
        return im

if __name__ == '__main__':
    # model path
    model_path = "yolov8n-seg.onnx"
    # instantiate the model
    model = YOLOv8Seg(model_path)
    conf = 0.35
    iou = 0.45
    # mode 1: predict a single image and show the result; mode 2: webcam detection with live FPS (not implemented in this snippet)
    mode = 1
    # read the image with OpenCV
    img = cv2.imread('street.jpg')
    # inference
    boxes, segments, _ = model(img, conf_threshold=conf, iou_threshold=iou)
    # draw the results
    if len(boxes) > 0:
        output_image = model.draw_and_visualize(img, boxes, segments, vis=False, save=True)
    else:
        output_image = img
    print("image segmentation finished")
    cv2.imshow("seg", output_image)
    cv2.imwrite('image_seg.jpg', output_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
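
If, like combinedMask in the C++ main.cpp above, you also want a single binary mask covering all detected instances, keep the third return value of the model call (masks, a boolean array of shape [N, H, W]) instead of discarding it and OR the instances together. A minimal sketch to place inside the __main__ block:

    # keep the masks output instead of discarding it:
    boxes, segments, masks = model(img, conf_threshold=conf, iou_threshold=iou)
    if len(masks) > 0:
        # collapse the [N, H, W] boolean masks into one 8-bit binary image
        combined_mask = np.any(masks, axis=0).astype(np.uint8) * 255
        cv2.imwrite('combined_mask.png', combined_mask)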
