Export an ONNX model file to a TensorRT engine in FP16 format.
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- imported for its side effect: initializes the CUDA context

# Export an ONNX model to a serialized TensorRT engine built with FP16 enabled.
onnx_file_path = 'model.onnx'
engine_file_path = 'model_tesfp16.trt'

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
# The ONNX parser requires an explicit-batch network. The original passed the
# bare constant 1; spell out the flag so the intent is visible.
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

# Parse the ONNX file. The original ignored parse()'s boolean result, which
# silently produces an empty network (and later a None engine) on failure.
with open(onnx_file_path, 'rb') as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError('Failed to parse ONNX file: %s' % onnx_file_path)

# Build the engine: 4 GiB workspace, FP16 kernels allowed.
builder_config = builder.create_builder_config()
builder_config.max_workspace_size = 4 * (1 << 30)
builder_config.set_flag(trt.BuilderFlag.FP16)
engine = builder.build_engine(network, builder_config)
if engine is None:
    # build_engine returns None on failure instead of raising.
    raise RuntimeError('TensorRT engine build failed')

# Serialize the engine to disk so the inference scripts can load it.
with open(engine_file_path, 'wb') as f:
    f.write(engine.serialize())
Using a CycleGAN network, run inference in both Python and C++.
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- imported for its side effect: initializes the CUDA context
import numpy as np
import time
import cv2

# Load and deserialize the TensorRT engine produced by the export script.
engine_file_path = 'model_tesfp16.trt'
with open(engine_file_path, 'rb') as f:
    engine_data = f.read()
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
runtime = trt.Runtime(TRT_LOGGER)
engine = runtime.deserialize_cuda_engine(engine_data)
if engine is None:
    # deserialize_cuda_engine returns None on failure instead of raising.
    raise RuntimeError('Failed to deserialize engine: %s' % engine_file_path)

# Create the execution context used to launch inference.
context = engine.create_execution_context()

# Input/output tensor shapes; use (1, 3, 512, 512) for a 3-channel model.
input_shape = (1, 1, 512, 512)
output_shape = (1, 1, 512, 512)

# Read the test image as single-channel grayscale (flag 0 == IMREAD_GRAYSCALE).
input_data = cv2.imread("image1644.png", 0)
if input_data is None:
    # cv2.imread returns None rather than raising when the file is missing.
    raise FileNotFoundError('image1644.png')
# BUG FIX: the original line had the note "if 3-channel, use (1,3,512,512)"
# pasted after the statement without a leading '#', which is a SyntaxError.
input_data = input_data.reshape(input_shape).astype(np.float32)
output_data = np.empty(output_shape, dtype=np.float32)

# Allocate device buffers sized to match the host arrays.
d_input = cuda.mem_alloc(input_data.nbytes)
d_output = cuda.mem_alloc(output_data.nbytes)

# One asynchronous stream carries the upload, the launch, and the download;
# synchronize before touching output_data on the host.
stream = cuda.Stream()
cuda.memcpy_htod_async(d_input, input_data, stream)

T1 = time.time()
bindings = [int(d_input), int(d_output)]
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
cuda.memcpy_dtoh_async(output_data, d_output, stream)
stream.synchronize()
T2 = time.time()
print('程序运行时间:%s毫秒' % ((T2 - T1) * 1000))

print(type(output_data))
# Map the network output to an 8-bit image. The two steps below combine to
# 255 * (1 + x); presumably the model's output range makes this land in
# [0, 255] -- TODO confirm against the training normalization.
a_sque = np.squeeze(output_data)
a_sque = -a_sque * 255
a_sque = 255 - a_sque
cv2.imwrite("tensorrt_ilubuntu.jpg", a_sque)
print("output_data = ", output_data)
For the C++ version, first configure the CMakeLists.txt:
cmake_minimum_required(VERSION 2.6)
project(cycle_gan)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
# FIX: option() takes (name "description" value); the original passed OFF in
# the description slot, so the option's value was left at its default.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Release)

find_package(CUDA REQUIRED)
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
# Include and link dirs of CUDA and TensorRT -- adapt these paths if your
# installations live elsewhere.
include_directories(/usr/local/cuda-11.6/include)
link_directories(/usr/local/cuda-11.6/lib64)
include_directories(/home/mao/bag/TensorRT-8.2.5.1/include/)
link_directories(/home/mao/bag/TensorRT-8.2.5.1/lib/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -g -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# Single-source executable; NVCC-aware add_executable from FindCUDA.
cuda_add_executable(cycle_gan main.cpp)
target_link_libraries(cycle_gan nvonnxparser nvinfer nvinfer_plugin)
target_link_libraries(cycle_gan cudart)
target_link_libraries(cycle_gan ${OpenCV_LIBS})

if(UNIX)
  add_definitions(-O2 -pthread)
endif(UNIX)
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include
#include
#include