这个CUDA工程包括了三个文件,一个是makefile;一个是vector_add.cu,用于联系主机端与设备端;最后是vector_add_kernel.cu,这个就是设备端要执行的真正的CUDA程序。
首先看看makefile:
################################################################################
# Build configuration for the vector-add sample.
# This fragment is consumed by the NVIDIA SDK common makefile included below.
################################################################################

# Name of the generated executable
EXECUTABLE := zenny_basic

# CUDA source files (compiled with nvcc)
CUFILES := vector_add.cu

# C/C++ source files (compiled with gcc / g++); none for this sample.
# BUGFIX: the original line read "CFILES := /" — a mangled backslash that
# make would treat as a source file named "/". Leave the variable empty.
CFILES :=

CFLAGS :=

################################################################################
# Rules and targets
include ../../common/common.mk
上述文件中指定了一个vector_add.cu作为要用nvcc编译的源文件,而生成的可执行文件名是zenny_basic。
下面看看vector_add_kernel.cu文件:
// BUGFIX: the include guard was _MATRIXMUL_KERNEL_H_, copied from the SDK
// matrixMul sample — misleading and a collision risk if both samples are
// ever included together. Renamed to match this file.
#ifndef _VECTOR_ADD_KERNEL_H_
#define _VECTOR_ADD_KERNEL_H_

// Element-wise integer vector addition: outC[i] = vecA[i] + vecB[i].
//
// Launch layout: 1-D grid of 1-D blocks; each thread handles exactly one
// element, so the caller must launch exactly as many threads as there are
// elements (the host code launches 8 blocks x 16 threads for 128 elements).
// NOTE(review): there is no bounds check, so the total thread count must
// not exceed the length of the arrays — confirm at every call site.
__global__ void vector_add(int *vecA, int *vecB, int *outC)
{
    // Flat global index of the current thread.
    int tIdx = threadIdx.x + blockIdx.x * blockDim.x;

    outC[tIdx] = vecA[tIdx] + vecB[tIdx];
}

#endif // #ifndef _VECTOR_ADD_KERNEL_H_
上述代码很清楚,是将vecA与vecB的元素相加后,将结果给outC。
下面看看vector_add.cu:
// Utilities and system includes
#include <shrUtils.h>
#include <cutil_inline.h>

#include <stdio.h>

// includes, kernels
#include "vector_add_kernel.cu"

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
static void runTest(int argc, char** argv);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    puts("[ vector add ]");
    runTest(argc, argv);
    return 0;
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA: add two 128-element int vectors on the GPU
//! and print the result (expected output: 0, 2, 4, ..., 254).
////////////////////////////////////////////////////////////////////////////////
static void runTest(int argc, char** argv)
{
    // Problem size. Must match the launch configuration below:
    // 8 blocks * 16 threads = 128 threads, one element per thread
    // (the kernel has no bounds check).
    const int    kNumElements = 128;
    const size_t kBytes       = sizeof(int) * kNumElements;

    // Select the fastest available GPU.
    cudaSetDevice(cutGetMaxGflopsDeviceId());

    int devID;
    cudaDeviceProp props;

    // Report which device we are running on.
    cutilSafeCall(cudaGetDevice(&devID));
    cutilSafeCall(cudaGetDeviceProperties(&props, devID));
    // BUGFIX: the original format string used '/' where '\' belongs
    // ("/"%s/" ... /n"), which terminated the string literal early and
    // failed to compile. Restored the intended escapes.
    printf("Device %d: \"%s\" with Compute %d.%d capability\n",
           devID, props.name, props.major, props.minor);

    // allocate host memory
    int *pHostSrc1 = (int*)malloc(kBytes);
    int *pHostSrc2 = (int*)malloc(kBytes);
    int *pHostDst  = (int*)malloc(kBytes);

    // allocate device memory
    int *pDeviceSrc1;
    int *pDeviceSrc2;
    int *pDeviceDst;
    cutilSafeCall(cudaMalloc((void**)&pDeviceSrc1, kBytes));
    cutilSafeCall(cudaMalloc((void**)&pDeviceSrc2, kBytes));
    cutilSafeCall(cudaMalloc((void**)&pDeviceDst,  kBytes));

    // Initialize host memory: both inputs hold 0..127, so element i of the
    // result should be 2*i.
    for (int i = 0; i < kNumElements; i++)
        pHostSrc1[i] = pHostSrc2[i] = i;

    // copy host memory to device
    cutilSafeCall(cudaMemcpy(pDeviceSrc1, pHostSrc1, kBytes,
                             cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(pDeviceSrc2, pHostSrc2, kBytes,
                             cudaMemcpyHostToDevice));

    // kernel warmup
    vector_add<<< 8, 16 >>>(pDeviceSrc1, pDeviceSrc2, pDeviceDst);
    cudaThreadSynchronize();

    printf("Run Kernels...\n\n");

    // 8 blocks and 16 threads for each block
    vector_add<<< 8, 16 >>>(pDeviceSrc1, pDeviceSrc2, pDeviceDst);

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");
    cudaThreadSynchronize();

    // copy result from device to host
    cutilSafeCall(cudaMemcpy(pHostDst, pDeviceDst, kBytes,
                             cudaMemcpyDeviceToHost));

    // Output the result, 16 values per row.
    for (int i = 0; i < kNumElements; i++)
    {
        if ((i & 15) == 0)
            puts("");
        printf("%d ", pHostDst[i]);
    }

    // clean up memory
    free(pHostSrc1);
    free(pHostSrc2);
    free(pHostDst);
    cutilSafeCall(cudaFree(pDeviceSrc1));
    cutilSafeCall(cudaFree(pDeviceSrc2));
    cutilSafeCall(cudaFree(pDeviceDst));

    cudaThreadExit();
}