I. Remote login to a Linux server to run CUDA programs (excerpted from the guide provided by Prof. Liu Ying at UCAS)
1 On Windows, log in with an SSH client (for example, PuTTY) as remote_username@remote_host_ip; when prompted with "remote_username@remote_host_ip's password:", type the password: *******
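On Linux or macOS (or Windows with an OpenSSH client installed), the equivalent command-line login and file upload are, as a minimal sketch (remote_username, remote_host_ip, and the file name devicepro.cu are placeholders):
ssh remote_username@remote_host_ip
scp devicepro.cu remote_username@remote_host_ip:~/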
II. Running CUDA programs
1 Querying the hardware device properties
CPU properties: cat /proc/cpuinfo
Memory properties: cat /proc/meminfo
GPU properties:
nvcc devicepro.cu
./a.out
Querying the GPU properties requires a short program that reads the fields of the cudaDeviceProp structure. The code is shown below; save it as a *.cu file, compile, and run.
#include <iostream>
#include "cuda_runtime.h"
using namespace std;
int main()
{
    cudaDeviceProp prop;
    int count;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; i++)
    {
        cudaGetDeviceProperties(&prop, i);
        cout << "the information for device " << i << ":" << endl;
        cout << "name: " << prop.name << endl;
        cout << "total global memory: " << prop.totalGlobalMem << endl;
        cout << "total constant memory: " << prop.totalConstMem << endl;
        cout << "shared memory per block: " << prop.sharedMemPerBlock << endl;
        cout << "registers per block: " << prop.regsPerBlock << endl;
        cout << "threads in warp: " << prop.warpSize << endl;
        cout << "max threads per block: " << prop.maxThreadsPerBlock << endl;
        cout << "max threads dims: " << prop.maxThreadsDim[0] << " " << prop.maxThreadsDim[1] << " " << prop.maxThreadsDim[2] << endl;
        cout << "max grid dims: " << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " " << prop.maxGridSize[2] << endl;
        cout << "number of processors: " << prop.multiProcessorCount << endl;
    }
    return 0;
}
------------------------------------------------------------------------
main GPU properties (# nvcc devicepro.cu ; # ./a.out)
name: Tesla K10.G2.8GB
total global memory: 3757637632
total constant memory: 65536
shared memory per block: 49152
registers per block: 65536
threads in warp: 32
max threads per block: 1024
max threads dims: 1024 1024 64
max grid dims: 2147483647 65535 65535
number of processors: 8
------------------------------------------------------------------------
2 A simple CUDA program: ReverseArray (dimA = 256 * 1024, numThreadsPerBlock = 256)
2.1 dimGrid(1024); dimBlock(256); no shared memory
// includes, system
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include "cuda_runtime.h"
using namespace std;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
// Part 2 of 2: implement the kernel
__global__ void reverseArrayBlock(int *d_out, int *d_in, int dimA)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x; // global thread index; blockDim.x == 256 here
d_out[dimA - tid - 1] = d_in[tid]; // write each element to its mirrored position
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
cudaEvent_t timeStartEvent,timeEndEvent;
cudaEventCreate( &timeStartEvent, 0);
cudaEventCreate(&timeEndEvent, 0);
cudaEventRecord( timeStartEvent, 0);
// pointer for host memory and size
int *h_a;
int dimA = 256 * 1024; // 256K elements (1MB total)
// pointer for device memory
int *d_b, *d_a;
// define grid and block size
int numThreadsPerBlock = 256;
// Compute number of blocks needed based on array size and desired block size
int numBlocks = dimA / numThreadsPerBlock;
// Part 1 of 2: Compute the number of bytes of shared memory needed
// This is used in the kernel invocation below
// int sharedMemSize = numThreadsPerBlock*sizeof(int);
// allocate host and device memory
size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
h_a = (int *) malloc(memSize);
cudaMalloc( (void **) &d_a, memSize );
cudaMalloc( (void **) &d_b, memSize );
// Initialize input array on host
for (int i = 0; i < dimA; ++i)
{
h_a[i] = i;
}
// Copy host array to device array
cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
// launch kernel
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
reverseArrayBlock<<< dimGrid, dimBlock >>>( d_b, d_a ,dimA);
// block until the device has completed
cudaDeviceSynchronize();
// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
// device to host copy
cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );
// Check for any CUDA errors
checkCUDAError("memcpy");
// verify the data returned to the host is correct
for (int i = 0; i < dimA; i++)
{
assert(h_a[i] == dimA - 1 - i );
}
// free device memory
cudaFree(d_a);
cudaFree(d_b);
// free host memory
free(h_a);
// If the program makes it this far, then the results are correct and
// there are no run-time errors. Good work!
printf("Correct!\n");
cudaEventRecord(timeEndEvent, 0);
cudaEventSynchronize(timeEndEvent);
float elapsedTime = 0;
cudaEventElapsedTime(&elapsedTime, timeStartEvent, timeEndEvent);
cout << "elapsedTime " << elapsedTime << " ms." << endl;
cudaEventDestroy(timeStartEvent);
cudaEventDestroy(timeEndEvent);
return 0;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
// running time: 4.03104 ms
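To compile and run (assuming the source is saved as reverseArray.cu; this file name is a placeholder):
nvcc reverseArray.cu
./a.out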
2.2 dimGrid(1024); dimBlock(256); sharedMemSize = 256*sizeof(int)
// includes, system
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include "cuda_runtime.h"
using namespace std;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
// Part 2 of 2: implement the kernel
__global__ void reverseArrayBlock(int *d_out, int *d_in, int dimA)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x; // global thread index; blockDim.x == 256 here
__shared__ int Arr[256];
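// Note: Arr is statically sized, so the dynamic shared-memory size passed as the
// third launch parameter below is allocated but not actually used by this array.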
Arr[threadIdx.x] = d_in[tid];
__syncthreads();
d_out[dimA-tid-1] = Arr[threadIdx.x];
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
cudaEvent_t timeStartEvent,timeEndEvent;
cudaEventCreate( &timeStartEvent, 0);
cudaEventCreate(&timeEndEvent, 0);
cudaEventRecord( timeStartEvent, 0);
// pointer for host memory and size
int *h_a;
int dimA = 256 * 1024; // 256K elements (1MB total)
// pointer for device memory
int *d_b, *d_a;
// define grid and block size
int numThreadsPerBlock = 256;
// Compute number of blocks needed based on array size and desired block size
int numBlocks = dimA / numThreadsPerBlock;
// Part 1 of 2: Compute the number of bytes of shared memory needed
// This is used in the kernel invocation below
int sharedMemSize = numThreadsPerBlock*sizeof(int);
// allocate host and device memory
size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
h_a = (int *) malloc(memSize);
cudaMalloc( (void **) &d_a, memSize );
cudaMalloc( (void **) &d_b, memSize );
// Initialize input array on host
for (int i = 0; i < dimA; ++i)
{
h_a[i] = i;
}
// Copy host array to device array
cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
// launch kernel
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
reverseArrayBlock<<< dimGrid, dimBlock, sharedMemSize >>>( d_b, d_a ,dimA);
// block until the device has completed
cudaDeviceSynchronize();
// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
// device to host copy
cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );
// Check for any CUDA errors
checkCUDAError("memcpy");
// verify the data returned to the host is correct
for (int i = 0; i < dimA; i++)
{
assert(h_a[i] == dimA - 1 - i );
}
// free device memory
cudaFree(d_a);
cudaFree(d_b);
// free host memory
free(h_a);
// If the program makes it this far, then the results are correct and
// there are no run-time errors. Good work!
printf("Correct!\n");
cudaEventRecord(timeEndEvent, 0);
cudaEventSynchronize(timeEndEvent);
float elapsedTime = 0;
cudaEventElapsedTime(&elapsedTime, timeStartEvent, timeEndEvent);
cout << "elapsedTime " << elapsedTime << " ms." << endl;
cudaEventDestroy(timeStartEvent);
cudaEventDestroy(timeEndEvent);
return 0;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
// running time: 3.99392 ms
3 A simple CUDA program: ParallelAdd (N = 2048*2048, THREADS_PER_BLOCK = 512)
3.1 dimGrid(32,64); dimBlock(16, 32); no shared memory
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define N (2048*2048)
#define THREADS_PER_BLOCK 512 //16*32
using namespace std;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
//Please implement the kernel function Add().
__global__ void add(int* dev_a, int* dev_b, int* dev_c)
{
// Flatten the 2-D thread/block indices; with the 1-D launch used below this
// reduces to tid = threadIdx.x and bid = blockIdx.x.
int tid = threadIdx.y * blockDim.x + threadIdx.x;
int bid = blockIdx.y * gridDim.x + blockIdx.x;
int idx = bid * blockDim.x * blockDim.y + tid;
dev_c[idx] = dev_a[idx] + dev_b[idx];
}
int main( void ) {
cudaEvent_t timeStartEvent,timeEndEvent;
cudaEventCreate( &timeStartEvent, 0);
cudaEventCreate(&timeEndEvent, 0);
cudaEventRecord( timeStartEvent, 0);
int *a, *b, *c; // host copies of a, b, c
int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
int size = N * sizeof( int); // we need space for N integers
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, size );
a = (int*)malloc( size );
b = (int*)malloc( size );
c = (int*)malloc( size );
for (int i = 0; i < N; ++i){
a[i] = rand();
b[i] = rand();
}
//random_ints( a, N );
//random_ints( b, N );
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);
// launch add() kernel with blocks and threads
dim3 dimGrid(32, 64);
dim3 dimBlock(16, 32);
add<<< N/THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( dev_a, dev_b, dev_c);
//add<<< dimGrid, dimBlock >>>( dev_a, dev_b, dev_c);
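// Note: the commented-out 2-D launch with dimGrid(32,64) and dimBlock(16,32)
// starts 2048 * 512 = 1M threads, covering only a quarter of the N = 4M
// elements; the 1-D launch above is the configuration actually timed.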
// copy device result back to host copy of c
// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost);
// Check for any CUDA errors
checkCUDAError("memcpy");
free( a ); free( b ); free( c );
cudaFree( dev_a);
cudaFree( dev_b);
cudaFree( dev_c);
// If the program makes it this far, then the results are correct and
// there are no run-time errors. Good work!
printf("Correct!\n");
cudaEventRecord(timeEndEvent, 0);
cudaEventSynchronize(timeEndEvent);
float elapsedTime = 0;
cudaEventElapsedTime(&elapsedTime, timeStartEvent, timeEndEvent);
cout << "elapsedTime " << elapsedTime << " ms." << endl;
cudaEventDestroy(timeStartEvent);
cudaEventDestroy(timeEndEvent);
return 0;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
// running time: 237.658 ms
3.2 dimGrid(32,64); dimBlock(16, 32); sharedMemSize = 16*32*sizeof(int)
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define N (2048*2048)
#define THREADS_PER_BLOCK 512 //16*32
using namespace std;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
//Please implement the kernel function Add().
__global__ void add(int* dev_a, int* dev_b, int* dev_c)
{
// Flatten the 2-D thread/block indices; with the 1-D launch used below this
// reduces to tid = threadIdx.x and bid = blockIdx.x.
int tid = threadIdx.y * blockDim.x + threadIdx.x;
int bid = blockIdx.y * gridDim.x + blockIdx.x;
int idx = bid * blockDim.x * blockDim.y + tid;
// Stage both inputs through shared memory before adding.
__shared__ int A[THREADS_PER_BLOCK];
__shared__ int B[THREADS_PER_BLOCK];
A[tid] = dev_a[idx];
B[tid] = dev_b[idx];
__syncthreads();
dev_c[idx] = A[tid] + B[tid];
}
int main( void ) {
cudaEvent_t timeStartEvent,timeEndEvent;
cudaEventCreate( &timeStartEvent, 0);
cudaEventCreate(&timeEndEvent, 0);
cudaEventRecord( timeStartEvent, 0);
int *a, *b, *c; // host copies of a, b, c
int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
int size = N * sizeof( int); // we need space for N integers
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, size );
a = (int*)malloc( size );
b = (int*)malloc( size );
c = (int*)malloc( size );
for (int i = 0; i < N; ++i){
a[i] = rand();
b[i] = rand();
}
//random_ints( a, N );
//random_ints( b, N );
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);
// launch add() kernel with blocks and threads
dim3 dimGrid(32, 64);
dim3 dimBlock(16, 32);
int sharesize = 16*32*sizeof(int);
add<<< N/THREADS_PER_BLOCK, THREADS_PER_BLOCK, sharesize >>>( dev_a, dev_b, dev_c);
//add<<< dimGrid, dimBlock >>>( dev_a, dev_b, dev_c);
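// Note: as in 3.1, dimGrid(32,64) would cover only a quarter of N, so the 1-D
// launch above is used; also, A and B are statically sized __shared__ arrays,
// so the dynamic sharesize parameter is allocated but never referenced.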
// copy device result back to host copy of c
// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost);
// Check for any CUDA errors
checkCUDAError("memcpy");
/*
// verify the data returned to the host is correct
for (int i = 0; i < N; i++)
{
assert(c[i] == a[i]+b[i] );
}*/
free( a ); free( b ); free( c );
cudaFree( dev_a);
cudaFree( dev_b);
cudaFree( dev_c);
// If the program makes it this far, then the results are correct and
// there are no run-time errors. Good work!
printf("Correct!\n");
cudaEventRecord(timeEndEvent, 0);
cudaEventSynchronize(timeEndEvent);
float elapsedTime = 0;
cudaEventElapsedTime(&elapsedTime, timeStartEvent, timeEndEvent);
cout << "elapsedTime " << elapsedTime << " ms." << endl;
cudaEventDestroy(timeStartEvent);
cudaEventDestroy(timeEndEvent);
return 0;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
// running time: 235.297 ms
4 Analysis
The shared memory used in a kernel can be made somewhat larger if helpful, as long as each SM stays within 16 KB (and this Tesla K10 actually reports 49152 bytes = 48 KB of shared memory per block).
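For instance, with sharedMemSize = 256 * sizeof(int) = 1 KB per block (as in 2.2), even 3 resident blocks use only 3 KB of shared memory per SM, well under the limit.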
In ReverseArray, the size of array A is 256*1024, which is smaller than the maximum number of threads one grid can hold (1024*65535). Therefore, we can start 256*1024 threads, one for each element of A.
We use 256 threads per block, as suggested. This way one SM can hold 3 blocks, i.e., 768 threads.
Comparing the GPU run times for the different dimGrid and dimBlock settings, we conclude that the run time is essentially independent of the shape and dimensionality of the grid and block. Moreover, the data placed in shared memory is not reused in this case, so using shared memory does not reduce the run time.
Because the amount of data is small, CUDA does not show its superiority over the CPU here.
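For comparison, a serial CPU baseline for the same reverse operation could look like the following sketch (an illustrative program timed with std::chrono; it is not part of the original lab code):
#include <chrono>
#include <iostream>
#include <vector>
using namespace std;
int main()
{
    const int dimA = 256 * 1024;
    vector<int> a(dimA), b(dimA);
    for (int i = 0; i < dimA; ++i) a[i] = i;
    auto t0 = chrono::high_resolution_clock::now();
    // serial reverse: the same work the CUDA kernel does in parallel
    for (int i = 0; i < dimA; ++i)
        b[dimA - 1 - i] = a[i];
    auto t1 = chrono::high_resolution_clock::now();
    cout << "CPU elapsedTime "
         << chrono::duration<double, milli>(t1 - t0).count() << " ms." << endl;
    return 0;
}
A 1 MB copy like this typically completes in a fraction of a millisecond on a CPU, which is consistent with the conclusion above.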
5 References
NVIDIA CUDA: http://www.nvidia.cn/object/cuda-cn.html
Measuring CUDA execution time: http://www.cnblogs.com/lopezycj/archive/2011/08/09/cuda_time.html
Learning makefiles from the CUDA SDK: http://www.cnblogs.com/FreeAquar/archive/2012/04/03/2430860.html
Command-line gdb debugging on Linux: http://blog.csdn.net/dadalan/article/details/3758025