CUDA进行RGB图像灰度转换

CUDA进行RGB图像灰度转换

[设计流程]

1.使用OpenCV中imread()函数传入图片;
2.读取图片的高度和宽度(像素行数和列数),分别使用imgHeight和imgWidth储存;
3. 直方图统计像素值,对灰度直方图数组初始化;
4.在GPU中开辟输入输出空间,分配内存空间;
5.将图片数据传入GPU;
6.并行灰度转换,clock_t统计执行时间,添加cudaDeviceSynchronize()同步CPU和GPU,否则测速结果为CPU启动内核函数的时间;
7.串行灰度转换,并统计时间;
8.将数据从GPU传回CPU,imwrite()函数生成灰度图片;
9.释放内存;
10.分析实验数据和结果。

[实验数据]

CUDA进行RGB图像灰度转换_第1张图片
CUDA进行RGB图像灰度转换_第2张图片
执行时间比对:
CUDA进行RGB图像灰度转换_第3张图片
CUDA进行RGB图像灰度转换_第4张图片
下图为未完成的灰度图像,需要继续等待:
CUDA进行RGB图像灰度转换_第5张图片

[实验结果及分析]

CPU串行时间约为CUDA并行方式的26.63929倍。在执行时间统计过程中,曾出现CUDA并行执行时间为0.0000000000的情况,但串行时间正常,并且能正常输出灰度变换后的图像。查询资料后最初推测可能是编译器的原因,但更可能的原因是clock()的计时分辨率有限:当内核执行时间小于一个计时单位时,测得的差值就是0;改用cudaEvent计时可以避免这一问题。在进行imwrite()灰度图像生成的过程中耗费时间较多,使用的样例图片(testpic2.png)大小为46253KB;在生成灰度图片(result.png)的过程中,文件大小逐渐增加,在没有完全生成之前,图片下半部分为黑色,由此推测imwrite()是逐行写入灰度值的。

[源代码]

[show_image.cu]

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <stdexcept>
#include <vector>

#include "opencv2/highgui.hpp"
using namespace std;
using namespace cv;

// Converts an interleaved 8-bit, 3-channel BGR image to a single-channel
// grayscale image on the GPU.
//
// Launch layout: a 2D grid of 2D blocks in which each thread handles one pixel;
// x covers columns and y covers rows. The grid may be larger than the image,
// threads that fall outside it do nothing.
//
//   dataIn    - device pointer to imgHeight * imgWidth pixels, row-major with no
//               row padding; the pixel is read as blue = .x, green = .y, red = .z
//   dataOut   - device pointer to imgHeight * imgWidth output bytes
//   imgHeight - number of rows
//   imgWidth  - number of columns
__global__ void rgb2grayInCuda(uchar3 *dataIn, unsigned char *dataOut, int imgHeight, int imgWidth)
{
    const int xIndex = threadIdx.x + blockIdx.x * blockDim.x;	// column of this thread
    const int yIndex = threadIdx.y + blockIdx.y * blockDim.y;	// row of this thread

    if (xIndex < imgWidth && yIndex < imgHeight)
    {
        // Widen before multiplying so the flat index cannot overflow int.
        const size_t pixel = (size_t)yIndex * imgWidth + xIndex;
        const uchar3 bgr = dataIn[pixel];

        // Luma weights: 0.299 R + 0.587 G + 0.114 B. The input is BGR, so the
        // blue weight goes with .x and the red weight with .z.
        dataOut[pixel] = (unsigned char)(0.114f * bgr.x + 0.587f * bgr.y + 0.299f * bgr.z);
    }
}
// Serial (CPU) grayscale conversion of an interleaved 8-bit, 3-channel image.
//
//   d_in      - imgheight * imgwidth pixels, 3 bytes each, row-major with no row
//               padding; bytes are read in BGR order (blue, green, red), which is
//               the layout of the buffer main() passes in
//   d_out     - receives imgheight * imgwidth grayscale bytes
//   imgheight - number of rows
//   imgwidth  - number of columns
//
// Despite the d_ prefix, both pointers are dereferenced on the host.
void rgb2grayincpu(unsigned char * const d_in, unsigned char * const d_out,uint imgheight, uint imgwidth)
{
    // The buffer has no row padding, so one flat loop visits every pixel.
    // size_t avoids both signed/unsigned comparison and 32-bit index overflow.
    const size_t pixelCount = (size_t)imgheight * imgwidth;

    for (size_t p = 0; p < pixelCount; ++p)
    {
        const unsigned char *bgr = d_in + p * 3;

        // Luma weights: 0.299 R + 0.587 G + 0.114 B, applied to BGR-ordered bytes.
        d_out[p] = (unsigned char)(0.114f * bgr[0] + 0.587f * bgr[1] + 0.299f * bgr[2]);
    }
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Accumulates a 256-bin histogram of 8-bit values on the GPU.
//
// Each thread derives one flat element index from its 2D block/thread
// coordinates and increments the bin selected by that element.
//
//   dataIn    - device pointer to the 8-bit values to count
//   hist      - device pointer to 256 int bins; must be zeroed by the caller
//   numPixels - number of valid elements in dataIn. Threads whose index is
//               >= numPixels do nothing. A negative value (the default)
//               disables the check, in which case the launch must cover
//               exactly the number of elements in dataIn.
__global__ void imHistInCuda(unsigned char *dataIn, int *hist, long long numPixels = -1)
{
    const long long threadIndex = threadIdx.x + threadIdx.y * blockDim.x;
    const long long blockIndex = blockIdx.x + (long long)blockIdx.y * gridDim.x;
    const long long index = threadIndex + blockIndex * blockDim.x * blockDim.y;

    // The grid is rounded up to whole blocks, so trailing threads may have no
    // element to read.
    if (numPixels >= 0 && index >= numPixels)
    {
        return;
    }

    // Many threads may hit the same bin at once; atomicAdd makes each increment
    // indivisible, whereas a plain ++ would lose updates.
    atomicAdd(&hist[dataIn[index]], 1);
}

// Loads testpic2.png, converts it to grayscale on the GPU and on the CPU,
// prints and logs both execution times, and writes the GPU result to result.png.
// Returns 0 on success and 1 on an I/O failure; CUDA failures terminate the
// program through CUDA_CHECK.
int main()
{
    // Load the input image; imread returns an empty Mat when it fails.
    Mat srcImg = imread("testpic2.png");
    if (srcImg.empty())
    {
        fprintf(stderr, "failed to read image: testpic2.png\n");
        return 1;
    }
    // The raw-buffer copies below assume 3 bytes per pixel and no row padding.
    if (srcImg.type() != CV_8UC3)
    {
        fprintf(stderr, "unexpected image type, expected 8-bit 3-channel\n");
        return 1;
    }
    if (!srcImg.isContinuous())
    {
        srcImg = srcImg.clone();
    }

    const int imgHeight = srcImg.rows;
    const int imgWidth = srcImg.cols;
    const size_t numPixels = (size_t)imgHeight * imgWidth;

    Mat grayImg(imgHeight, imgWidth, CV_8UC1, Scalar(0));	// GPU result
    Mat cpuGrayImg(imgHeight, imgWidth, CV_8UC1, Scalar(0));	// CPU result (timing only)
    int hist[256];	// grayscale histogram
    memset(hist, 0, sizeof(hist));

    // Device buffers: BGR input, grayscale output, histogram bins.
    uchar3 *d_in = NULL;
    unsigned char *d_out = NULL;
    int *d_hist = NULL;

    CUDA_CHECK(cudaMalloc((void**)&d_in, numPixels * sizeof(uchar3)));
    CUDA_CHECK(cudaMalloc((void**)&d_out, numPixels * sizeof(unsigned char)));
    CUDA_CHECK(cudaMalloc((void**)&d_hist, 256 * sizeof(int)));

    // Upload the image and the zeroed histogram.
    CUDA_CHECK(cudaMemcpy(d_in, srcImg.data, numPixels * sizeof(uchar3), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_hist, hist, 256 * sizeof(int), cudaMemcpyHostToDevice));

    // One thread per pixel; the grid is rounded up to whole 32x32 blocks.
    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Time the kernel with CUDA events: they are recorded on the GPU timeline,
    // so a short kernel does not round down to zero as it can with clock().
    cudaEvent_t gpuStart, gpuStop;
    CUDA_CHECK(cudaEventCreate(&gpuStart));
    CUDA_CHECK(cudaEventCreate(&gpuStop));

    CUDA_CHECK(cudaEventRecord(gpuStart));
    rgb2grayInCuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth);
    CUDA_CHECK(cudaGetLastError());	// launch-configuration errors
    CUDA_CHECK(cudaEventRecord(gpuStop));
    CUDA_CHECK(cudaEventSynchronize(gpuStop));	// wait for the kernel to finish

    float gpuMilliseconds = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&gpuMilliseconds, gpuStart, gpuStop));
    CUDA_CHECK(cudaEventDestroy(gpuStart));
    CUDA_CHECK(cudaEventDestroy(gpuStop));
    double gputime = gpuMilliseconds / 1000.0;

    // Print the CUDA execution time.
    printf("cuda exec time is %.20lf\n", gputime);

    // Histogram of the grayscale image; numPixels stops the padding threads
    // from reading past the end of d_out.
    imHistInCuda<<<blocksPerGrid, threadsPerBlock>>>(d_out, d_hist, (long long)numPixels);
    CUDA_CHECK(cudaGetLastError());

    // Copy results back; these blocking copies also wait for the histogram kernel.
    CUDA_CHECK(cudaMemcpy(hist, d_hist, 256 * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(grayImg.data, d_out, numPixels * sizeof(unsigned char), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
    CUDA_CHECK(cudaFree(d_hist));

    // Serial conversion, written to its own buffer so that the GPU result in
    // grayImg is what gets saved.
    clock_t start, end;
    start = clock();
    rgb2grayincpu(srcImg.data, cpuGrayImg.data, imgHeight, imgWidth);
    end = clock();

    double cputime = (double)(end - start) / CLOCKS_PER_SEC;

    // Print the serial execution time.
    printf("cpu exec time is %.20lf\n", cputime);

    // Append both timings to a log file for later comparison.
    FILE* fp = fopen("time.txt", "a");
    if (fp != NULL)
    {
        fprintf(fp, "cpu exec time is %.20lf s , cuda exec time is %.20lf s \n", cputime, gputime);
        fclose(fp);
    }
    else
    {
        fprintf(stderr, "failed to open time.txt for appending\n");
    }

    // Maximum PNG compression. IMWRITE_PNG_COMPRESSION is the C++ name of the
    // flag; the CV_-prefixed C constant is not available in newer OpenCV.
    vector<int> compression_params;
    compression_params.push_back(IMWRITE_PNG_COMPRESSION);
    compression_params.push_back(9);

    try
    {
        // Writes the grayscale image into the working directory.
        if (!imwrite("result.png", grayImg, compression_params))
        {
            fprintf(stderr, "failed to write image: result.png\n");
            return 1;
        }
    }
    catch (const std::exception& ex)	// cv::Exception derives from std::exception
    {
        fprintf(stderr, "图像转换成PNG格式发生错误:%s\n", ex.what());
        return 1;
    }
    return 0;
}

[CMakeLists.txt]

# Builds the display_image executable from show_image.cu with OpenCV and CUDA.
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
PROJECT(display_image)
FIND_PACKAGE(OpenCV REQUIRED)
FIND_PACKAGE(CUDA REQUIRED)
# CUDA_ADD_EXECUTABLE passes directory-level include paths to nvcc, so the
# OpenCV headers must be added here rather than through the linked target.
INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS})
CUDA_ADD_EXECUTABLE(display_image show_image.cu)
TARGET_LINK_LIBRARIES(display_image ${OpenCV_LIBS})

你可能感兴趣的:(并行程序设计,并行计算)