1.使用OpenCV中imread()函数传入图片;
2.读取图片的尺寸(行数和列数),分别使用imgHeight和imgWidth储存;
3.准备用于统计像素灰度值的直方图数组,并将该数组初始化为0;
4.在GPU中开辟输入输出空间,分配内存空间;
5.将图片数据传入GPU;
6.并行灰度转换,clock_t统计执行时间,添加cudaDeviceSynchronize()同步CPU和GPU,否则测速结果为CPU启动内核函数的时间;
7.串行灰度转换,并统计时间;
8.将数据从GPU传回CPU,imwrite()函数生成灰度图片;
9.释放内存;
10.分析实验数据和结果。
CPU串行执行时间约为CUDA并行方式的26.63929倍。在统计执行时间的过程中,曾出现CUDA并行执行时间为0.0000000000的情况,但串行时间正常,并且能正常输出灰度变换后的图像;查询资料后推测可能是编译器的原因导致(另一种可能是clock()的计时分辨率不足以分辨耗时极短的内核,改用cudaEvent计时可以验证这一点)。imwrite()生成灰度图像的过程耗时较多:使用的样例图片(testpic2.png)大小为46253KB,在生成灰度图片(result.png)的过程中,文件大小逐渐增加,且在图片完全生成之前,其下半部分为黑色,由此可以推知imwrite()是逐行写入灰度值的。
[show_image.cu]
#include "cuda_runtime.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include "opencv2/highgui.hpp"
#include <exception>
#include <stdexcept>
#include <vector>
using namespace std;
using namespace cv;
// Converts an interleaved 8-bit BGR image to an 8-bit grayscale image.
//
// Launch layout: a 2D grid of 2D blocks in which thread (x, y) handles the
// pixel at column x, row y. Threads that fall outside the image return
// without touching memory, so the grid may be rounded up past the image size.
//
// dataIn    - imgHeight * imgWidth pixels, row-major with no row padding;
//             .x / .y / .z hold the blue / green / red channel.
// dataOut   - imgHeight * imgWidth gray values in the same row-major layout.
// imgHeight - number of rows.
// imgWidth  - number of columns.
__global__ void rgb2grayInCuda(uchar3 *dataIn, unsigned char *dataOut, int imgHeight, int imgWidth)
{
    // Column and row handled by this thread.
    int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    if (xIndex >= imgWidth || yIndex >= imgHeight)
    {
        return;
    }

    // Widen before multiplying so the offset cannot wrap in 32-bit arithmetic.
    size_t pixel = (size_t)yIndex * imgWidth + xIndex;
    uchar3 bgr = dataIn[pixel];

    // Luma weights: 0.299 for red, 0.587 for green, 0.114 for blue. The input
    // is BGR, so red is .z and blue is .x. The result is truncated, not rounded.
    dataOut[pixel] = (unsigned char)(0.299f * bgr.z + 0.587f * bgr.y + 0.114f * bgr.x);
}
// Serial (CPU) grayscale conversion of an interleaved 8-bit, 3-channel image.
//
// d_in      - imgheight * imgwidth pixels, 3 bytes per pixel stored as
//             blue, green, red (the order cv::imread produces), row-major
//             with no row padding. Despite the "d_" prefix this pointer is
//             dereferenced on the host.
// d_out     - receives imgheight * imgwidth gray bytes in the same pixel order.
// imgheight - number of rows.
// imgwidth  - number of columns.
void rgb2grayincpu(unsigned char * const d_in, unsigned char * const d_out,uint imgheight, uint imgwidth)
{
    // Rows are contiguous, so the two nested loops collapse into one pass over
    // all pixels. size_t keeps the byte offset from wrapping on large images.
    const size_t pixelCount = (size_t)imgheight * imgwidth;
    for (size_t p = 0; p < pixelCount; ++p)
    {
        const unsigned char *bgr = d_in + p * 3;
        // Luma weights: 0.299 red, 0.587 green, 0.114 blue; red is byte 2 and
        // blue is byte 0 in BGR order. The result is truncated, not rounded.
        d_out[p] = (unsigned char)(0.299f * bgr[2] + 0.587f * bgr[1] + 0.114f * bgr[0]);
    }
}
// Builds a 256-bin histogram of 8-bit gray values.
//
// Launch layout: a 1D or 2D grid of 1D or 2D blocks. Every thread in the grid
// gets a unique linear id (all threads of block 0 first, then block 1, ...)
// and counts the pixel stored at that id.
//
// dataIn    - gray values, one byte per pixel.
// hist      - 256 counters; they are incremented, never reset, so the caller
//             must zero them before the launch.
// numPixels - number of valid entries in dataIn. Threads whose id is at or
//             beyond it return without touching memory, which makes a
//             rounded-up grid safe. The default (-1) disables the check for
//             callers that launch exactly one thread per pixel.
__global__ void imHistInCuda(unsigned char *dataIn, int *hist, int numPixels = -1)
{
    int threadIndex = threadIdx.x + threadIdx.y * blockDim.x;
    int blockIndex = blockIdx.x + blockIdx.y * gridDim.x;
    int index = threadIndex + blockIndex * blockDim.x * blockDim.y;

    if (numPixels >= 0 && index >= numPixels)
    {
        return;
    }

    // Many threads may hit the same bin at once; a plain ++ would lose
    // updates, so the increment has to be atomic.
    atomicAdd(&hist[dataIn[index]], 1);
}

// Prints a message and terminates the program when a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Loads testpic2.png, converts it to grayscale on the GPU and then on the CPU,
// prints both timings and appends them to time.txt, and writes result.png.
// Returns 0 on success and 1 when the image cannot be read or written; a CUDA
// failure terminates the process through checkCuda.
int main()
{
    // Load the input image; imread yields an empty Mat when it fails.
    Mat srcImg = imread("testpic2.png");
    if (srcImg.empty() || srcImg.type() != CV_8UC3 || !srcImg.isContinuous())
    {
        fprintf(stderr, "failed to load testpic2.png as a continuous 8-bit 3-channel image\n");
        return 1;
    }

    int imgHeight = srcImg.rows;
    int imgWidth = srcImg.cols;
    // Do the size arithmetic in size_t so int * int cannot overflow.
    const size_t pixelCount = (size_t)imgHeight * imgWidth;
    const size_t inBytes = pixelCount * sizeof(uchar3);
    const size_t outBytes = pixelCount * sizeof(unsigned char);

    Mat grayImg(imgHeight, imgWidth, CV_8UC1, Scalar(0)); // output gray image
    int hist[256] = {0};                                  // host copy of the histogram
    const size_t histBytes = sizeof(hist);

    // Device buffers: input pixels, gray output, histogram counters.
    uchar3 *d_in = NULL;
    unsigned char *d_out = NULL;
    int *d_hist = NULL;
    checkCuda(cudaMalloc((void**)&d_in, inBytes), "cudaMalloc(d_in)");
    checkCuda(cudaMalloc((void**)&d_out, outBytes), "cudaMalloc(d_out)");
    checkCuda(cudaMalloc((void**)&d_hist, histBytes), "cudaMalloc(d_hist)");

    // Upload the image and clear the histogram counters on the device.
    checkCuda(cudaMemcpy(d_in, srcImg.data, inBytes, cudaMemcpyHostToDevice), "upload of input image");
    checkCuda(cudaMemset(d_hist, 0, histBytes), "cudaMemset(d_hist)");

    // One thread per pixel, grid rounded up to cover the whole image.
    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Time the kernel with CUDA events: they are recorded on the GPU timeline,
    // so a very short kernel is not reported as 0 the way a coarse host
    // clock() reading can be.
    cudaEvent_t startEvent, stopEvent;
    checkCuda(cudaEventCreate(&startEvent), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stopEvent), "cudaEventCreate(stop)");
    checkCuda(cudaEventRecord(startEvent, 0), "cudaEventRecord(start)");
    rgb2grayInCuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth);
    checkCuda(cudaGetLastError(), "rgb2grayInCuda launch");
    checkCuda(cudaEventRecord(stopEvent, 0), "cudaEventRecord(stop)");
    // Wait for the kernel to finish; this also surfaces errors raised while it ran.
    checkCuda(cudaEventSynchronize(stopEvent), "rgb2grayInCuda execution");
    float gpuMilliseconds = 0.0f;
    checkCuda(cudaEventElapsedTime(&gpuMilliseconds, startEvent, stopEvent), "cudaEventElapsedTime");
    checkCuda(cudaEventDestroy(startEvent), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stopEvent), "cudaEventDestroy(stop)");
    double gputime = gpuMilliseconds / 1000.0;
    printf("cuda exec time is %.20lf\n", gputime);

    // Histogram of the gray image. The grid is rounded up, so pass the real
    // pixel count to keep the surplus threads from reading past d_out.
    imHistInCuda<<<blocksPerGrid, threadsPerBlock>>>(d_out, d_hist, (int)pixelCount);
    checkCuda(cudaGetLastError(), "imHistInCuda launch");

    // Copy results back; these blocking copies also wait for the kernel above.
    // hist is retrieved but not used further by this program.
    checkCuda(cudaMemcpy(hist, d_hist, histBytes, cudaMemcpyDeviceToHost), "download of histogram");
    checkCuda(cudaMemcpy(grayImg.data, d_out, outBytes, cudaMemcpyDeviceToHost), "download of gray image");

    // PNG compression level 9. cv::IMWRITE_PNG_COMPRESSION is used instead of
    // the legacy CV_IMWRITE_PNG_COMPRESSION macro, which newer OpenCV releases
    // no longer provide; both have the same value.
    vector<int> compression_params;
    compression_params.push_back(IMWRITE_PNG_COMPRESSION);
    compression_params.push_back(9);

    checkCuda(cudaFree(d_in), "cudaFree(d_in)");
    checkCuda(cudaFree(d_out), "cudaFree(d_out)");
    checkCuda(cudaFree(d_hist), "cudaFree(d_hist)");

    // Serial conversion; it overwrites grayImg, so result.png holds the CPU output.
    clock_t start = clock();
    rgb2grayincpu(srcImg.data, grayImg.data, imgHeight, imgWidth);
    clock_t end = clock();
    double cputime =(double)(end-start)/CLOCKS_PER_SEC;
    printf("cpu exec time is %.20lf\n",cputime );

    // Append both timings to a log file for later comparison. Logging is
    // best effort: a missing log file must not abort the run.
    FILE* fp = fopen("time.txt","a");
    if (fp != NULL)
    {
        fprintf(fp,"cpu exec time is %.20lf s , cuda exec time is %.20lf s \n", cputime, gputime);
        fclose(fp);
    }
    else
    {
        fprintf(stderr, "cannot open time.txt for appending\n");
    }

    // Write the gray image. imwrite reports failure either by returning false
    // or by throwing cv::Exception, which derives from std::exception (not
    // std::runtime_error).
    try
    {
        if (!imwrite("result.png",grayImg,compression_params))
        {
            fprintf(stderr, "图像转换成PNG格式发生错误:%s\n", "imwrite returned false");
            return 1;
        }
    }
    catch (const std::exception& ex)
    {
        fprintf(stderr, "图像转换成PNG格式发生错误:%s\n", ex.what());
        return 1;
    }
    return 0;
}
# Build script for the grayscale demo: builds show_image.cu into the
# display_image executable and links it against OpenCV.
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
PROJECT(display_image)
FIND_PACKAGE(OpenCV REQUIRED)
FIND_PACKAGE(CUDA REQUIRED)
# Make the OpenCV headers visible when compiling the .cu file.
INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS})
CUDA_ADD_EXECUTABLE(display_image show_image.cu)
# The variable reference has to stay on one line; a line break inside "${...}"
# is not a valid reference.
TARGET_LINK_LIBRARIES(display_image ${OpenCV_LIBS})