Logging in to a Linux Server over SSH to Run CUDA Programs

I. Remote Login to a Linux Server to Run CUDA Programs (adapted from the guide provided by Prof. Liu Ying at UCAS)

1 On Windows
* Download SSHSecureShellClient-3.2.9.exe
* Install SSH: accept the defaults (click Next at every step)
* The installation creates two desktop shortcuts:
SSH Secure Shell Client: the login client
SSH Secure File Transfer Client: the file upload/download client
* Log in to the Linux server
1) Click Connect (or Quick Connect)
2) Enter the Host Name and User Name
3) Enter the Password

* Log out
Type "exit" (or click Disconnect)


2 Transferring source files with SSH Secure File Transfer Client

3 Compiling and running programs with CUDA
* Getting to know the CUDA compiler nvcc: man nvcc
* Compilation modes (the guide lists four; the two shown here are):
Release mode: the CUDA kernel runs on the actual GPU
Example: nvcc <file>.cu -o <executable>
Debug mode: the CUDA kernel runs on the GPU but cannot be debugged; the CPU-side code can be debugged
Example: nvcc -g <file>.cu

* Use the -keep option to save the intermediate files generated during compilation: nvcc -keep <file>.cu
* Use the -clean option to remove all files generated by the corresponding command: nvcc -keep <file>.cu -clean
* Other commonly used nvcc options include: -o (name the output executable), -D (define a macro), -I (add a header search path), -include (force inclusion of a header), -L (add a library search path), -l (link against a library), -host-compilation (compile the CPU-side code as C or C++), and -Xptxas -v (report each kernel's register, shared memory, and local memory usage).
For details, consult the nvcc manual provided by NVIDIA or the man page.
* Running the program: in the directory containing the compiled executable, type: ./<executable>
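
As a concrete sketch combining several of these options (the file and directory names here are hypothetical, not from the guide):

nvcc reverse.cu -o reverse -I./include -DBLOCK_SIZE=256 -Xptxas -v
./reverse

The -Xptxas -v report is printed at compile time, so it is a quick way to check a kernel's register and shared memory footprint before running anything.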


4 Logging in to the Linux server from a Linux system
* Check whether the system has SSH installed (or see man ssh):
rpm -qa | grep ssh
cd ~
more install.log
and search the log for "ssh"
* Log in from a terminal
1) Enter the user name and IP address: ssh username@*.*.*.*
2) Enter the password
* Work on the command line as usual
* Log out
Type "exit"


5 Transferring files with scp on Linux
* Exit the ssh session, or switch to a new terminal
* Copy a file to the remote host:
Enter the scp command:
scp /path/to/localfile remote_username@remote_host_ip:/remote/path/
When prompted, enter the password of your account on the remote server:
remote_username@remote_host_ip's password: *******
* Download a file from the remote host:
Enter the scp command:
scp remote_username@remote_host_ip:/remote/path/remote_file /local/path/
When prompted, enter the password of your account on the remote server:

remote_username@remote_host_ip's password: *******
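
For instance, to upload the devicepro.cu source used in Part II below and later fetch a results file back (the user name and IP address are hypothetical):

scp ./devicepro.cu alice@192.168.1.100:/home/alice/cuda/
scp alice@192.168.1.100:/home/alice/cuda/results.txt ./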


II. Running CUDA Programs

1 Querying the device properties of the hardware platform

CPU properties: cat /proc/cpuinfo

Memory properties: cat /proc/meminfo
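
Both /proc files are verbose; it can help to filter for specific fields, for example (field names may vary slightly across kernel versions):

grep "model name" /proc/cpuinfo | uniq
grep MemTotal /proc/meminfo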

GPU properties:

nvcc devicepro.cu  

./a.out

Obtaining the GPU properties requires a short program that reads the fields of the cudaDeviceProp struct. The code below does this; save it as a *.cu file (e.g. devicepro.cu), then compile and run it as shown above.

#include <iostream>
#include <cuda_runtime.h>
using namespace std;

int main()
{
	cudaDeviceProp prop;

	int count;
	cudaGetDeviceCount(&count);

	for(int i = 0 ; i < count ; i++)
	{
		// Query each device and print the fields shown in the sample output below
		cudaGetDeviceProperties(&prop, i);
		cout<<"the information for the device : "<<i<<endl;
		cout<<"name:"<<prop.name<<endl;
		cout<<"total global memory:"<<prop.totalGlobalMem<<endl;
		cout<<"total constant memory:"<<prop.totalConstMem<<endl;
		cout<<"shared Memory Per Block:"<<prop.sharedMemPerBlock<<endl;
		cout<<"register Per Block:"<<prop.regsPerBlock<<endl;
		cout<<"threads in warps:"<<prop.warpSize<<endl;
		cout<<"max threads per block:"<<prop.maxThreadsPerBlock<<endl;
		cout<<"max threads dims:"<<prop.maxThreadsDim[0]<<" "<<prop.maxThreadsDim[1]<<" "<<prop.maxThreadsDim[2]<<endl;
		cout<<"max grid dims:"<<prop.maxGridSize[0]<<" "<<prop.maxGridSize[1]<<" "<<prop.maxGridSize[2]<<endl;
		cout<<"number of Processors: "<<prop.multiProcessorCount<<endl;
	}
	return 0;
}

----------------------------------------------------------------------------------------------------------------------------------------

main GPU properties (# nvcc devicepro.cu ; # ./a.out)
name:Tesla K10.G2.8GB
total global memory:3757637632
total constant memory:65536
shared Memory Per Block:49152
register Per Block:65536
threads in warps:32
max threads per block:1024
max threads dims:1024 1024 64
max grid dims:2147483647 65535 65535
number of Processors: 8

-----------------------------------------------------------------------------------------------------------------------------------------

2 A simple CUDA program: ReverseArray (dimA = 256 * 1024, numThreadsPerBlock = 256)

2.1 dimGrid(1024); dimBlock(256); no shared memory

// includes, system
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include "cuda_runtime.h"

using namespace std;

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);

// Part 2 of 2: implement the kernel
__global__ void reverseArrayBlock(int *d_out, int *d_in, int dimA)
{
    // Global index of this thread's element (blockDim.x == 256 here)
    int tid = blockIdx.x*blockDim.x + threadIdx.x;

    // Write the element to its mirrored position in the output array
    d_out[dimA-tid-1] = d_in[tid];
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv) 
{
	  cudaEvent_t timeStartEvent,timeEndEvent;
	  cudaEventCreate( &timeStartEvent, 0);
	  cudaEventCreate(&timeEndEvent, 0);
	  cudaEventRecord( timeStartEvent, 0);
	  
    // pointer for host memory and size
    int *h_a;
    int dimA = 256 * 1024; // 256K elements (1MB total)

    // pointer for device memory
    int *d_b, *d_a;

    // define grid and block size
    int numThreadsPerBlock = 256;

    // Compute number of blocks needed based on array size and desired block size
    int numBlocks = dimA / numThreadsPerBlock;  

    // Part 1 of 2: Compute the number of bytes of shared memory needed
    // This is used in the kernel invocation below
 //   int sharedMemSize = numThreadsPerBlock*sizeof(int);

    // allocate host and device memory
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    cudaMalloc( (void **) &d_a, memSize );
    cudaMalloc( (void **) &d_b, memSize );

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // Copy host array to device array
    cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<< dimGrid, dimBlock >>>( d_b, d_a ,dimA);

    // block until the device has completed
    cudaDeviceSynchronize();  // cudaThreadSynchronize() is deprecated

    // check if kernel execution generated an error
    // Check for any CUDA errors
    checkCUDAError("kernel invocation");
    
    // device to host copy
    cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );
    

    // Check for any CUDA errors
    checkCUDAError("memcpy");

     
    // verify the data returned to the host is correct
    for (int i = 0; i < dimA; i++)
    {
        assert(h_a[i] == dimA - 1 - i );
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_a);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors.  Good work!
    printf("Correct!\n");
    
		cudaEventRecord( timeEndEvent, 0) ;
		cudaEventSynchronize( timeEndEvent ) ;
		float elapsedTime = 0 ;
		cudaEventElapsedTime( & elapsedTime, timeStartEvent, timeEndEvent ) ;
		 
		cout << "elapsedTime  " << elapsedTime << " ms. ";
		cudaEventDestroy( timeStartEvent ) ;
		cudaEventDestroy( timeEndEvent ) ;

    return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) 
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }                         
}

// running time: 4.03104 ms
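
To build and run this listing with the nvcc workflow from Part I (the file name reverseArray.cu is arbitrary):

nvcc -o reverseArray reverseArray.cu
./reverseArray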

2.2 dimGrid(1024); dimBlock(256); sharedMemSize = 256*sizeof(int);

// includes, system
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include "cuda_runtime.h"

using namespace std;

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);

// Part 2 of 2: implement the kernel
__global__ void reverseArrayBlock(int *d_out, int *d_in, int dimA)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;

    // Stage the element in shared memory. Each thread reads back the same
    // slot it wrote, so there is no data reuse here; see the analysis in
    // section 4 below.
    __shared__ int Arr[256];
    Arr[threadIdx.x] = d_in[tid];
    __syncthreads();
    d_out[dimA-tid-1] = Arr[threadIdx.x];
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv) 
{
	  cudaEvent_t timeStartEvent,timeEndEvent;
	  cudaEventCreate( &timeStartEvent, 0);
	  cudaEventCreate(&timeEndEvent, 0);
	  cudaEventRecord( timeStartEvent, 0);
	  
    // pointer for host memory and size
    int *h_a;
    int dimA = 256 * 1024; // 256K elements (1MB total)

    // pointer for device memory
    int *d_b, *d_a;

    // define grid and block size
    int numThreadsPerBlock = 256;

    // Compute number of blocks needed based on array size and desired block size
    int numBlocks = dimA / numThreadsPerBlock;  

    // Part 1 of 2: Compute the number of bytes of shared memory needed
    // This is used in the kernel invocation below
    int sharedMemSize = numThreadsPerBlock*sizeof(int);

    // allocate host and device memory
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    cudaMalloc( (void **) &d_a, memSize );
    cudaMalloc( (void **) &d_b, memSize );

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // Copy host array to device array
    cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );

    // launch kernel. The third launch parameter reserves dynamic shared
    // memory; the statically declared Arr[] in the kernel does not use it.
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<< dimGrid, dimBlock, sharedMemSize >>>( d_b, d_a ,dimA);

    // block until the device has completed
    cudaDeviceSynchronize();  // cudaThreadSynchronize() is deprecated

    // check if kernel execution generated an error
    // Check for any CUDA errors
    checkCUDAError("kernel invocation");
    
    // device to host copy
    cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );
    

    // Check for any CUDA errors
    checkCUDAError("memcpy");

     
    // verify the data returned to the host is correct
    for (int i = 0; i < dimA; i++)
    {
        assert(h_a[i] == dimA - 1 - i );
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_a);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors.  Good work!
    printf("Correct!\n");
    
		cudaEventRecord( timeEndEvent, 0) ;
		cudaEventSynchronize( timeEndEvent ) ;
		float elapsedTime = 0 ;
		cudaEventElapsedTime( & elapsedTime, timeStartEvent, timeEndEvent ) ;
		 
		cout << "elapsedTime  " << elapsedTime << " ms. ";
		cudaEventDestroy( timeStartEvent ) ;
		cudaEventDestroy( timeEndEvent ) ;

    return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) 
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }                         
}


// running time: 3.99392 ms

3 A simple CUDA program: ParallelAdd (N = 2048*2048, THREADS_PER_BLOCK = 512)

3.1 dimGrid(32,64); dimBlock(16, 32); no shared memory

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define N (2048*2048)
#define THREADS_PER_BLOCK 512  //16*32

using namespace std;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);

//Please implement the kernel function Add().
__global__ void add(int* dev_a, int* dev_b, int* dev_c)
{
    // Linearize the (possibly 2D) thread and block indices. Using
    // blockDim/gridDim instead of hard-coded constants keeps the kernel
    // correct for both the 1D and the 2D launch configurations below.
    int tid = threadIdx.y*blockDim.x + threadIdx.x;
    int bid = blockIdx.y*gridDim.x + blockIdx.x;
    int idx = bid*(blockDim.x*blockDim.y) + tid;

    dev_c[idx] = dev_a[idx] + dev_b[idx];
}
int main( void ) {
cudaEvent_t timeStartEvent,timeEndEvent;
cudaEventCreate( &timeStartEvent, 0);
cudaEventCreate(&timeEndEvent, 0);
cudaEventRecord( timeStartEvent, 0);

int *a, *b, *c; // host copies of a, b, c
int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
int size = N * sizeof( int); // we need space for N integers
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, size );
a = (int*)malloc( size );
b = (int*)malloc( size );
c = (int*)malloc( size );


for (int i = 0; i < N; ++i){
    a[i] = rand();
    b[i] = rand();
}

//random_ints( a, N );
//random_ints( b, N );
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);
// launch add() kernel with blocks and threads
// (the 1D launch below is the one actually timed; note that the commented
// 2D alternative with dimGrid(32,64) yields only 2048 blocks, so it would
// cover just 2048*512 of the N elements)
dim3 dimGrid(32, 64);
dim3 dimBlock(16, 32);
add<<< N/THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( dev_a, dev_b, dev_c);
//add<<< dimGrid, dimBlock >>>( dev_a, dev_b, dev_c);
// copy device result back to host copy of c

// check if kernel execution generated an error
    // Check for any CUDA errors
    checkCUDAError("kernel invocation");
    
cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost);

 // Check for any CUDA errors
    checkCUDAError("memcpy");
free( a ); free( b ); free( c );
cudaFree( dev_a);
cudaFree( dev_b);
cudaFree( dev_c);

// If the program makes it this far, then the results are correct and
    // there are no run-time errors.  Good work!
    printf("Correct!\n");
    
		cudaEventRecord( timeEndEvent, 0) ;
		cudaEventSynchronize( timeEndEvent ) ;
		float elapsedTime = 0 ;
		cudaEventElapsedTime( & elapsedTime, timeStartEvent, timeEndEvent ) ;
		 
		cout << "elapsedTime  " << elapsedTime << " ms. ";
		cudaEventDestroy( timeStartEvent ) ;
		cudaEventDestroy( timeEndEvent ) ;
		
return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) 
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }                         
}

// running time: 237.658 ms

3.2 dimGrid(32,64); dimBlock(16, 32); sharedMemSize = 16*32*sizeof(int);

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define N (2048*2048)
#define THREADS_PER_BLOCK 512  //16*32

using namespace std;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);

//Please implement the kernel function Add().
__global__ void add(int* dev_a, int* dev_b, int* dev_c)
{
    int tid = threadIdx.y*blockDim.x + threadIdx.x;
    int bid = blockIdx.y*gridDim.x + blockIdx.x;
    int idx = bid*(blockDim.x*blockDim.y) + tid;

    // Stage both inputs in shared memory, one slot per thread. Flat arrays
    // of THREADS_PER_BLOCK entries (16*32 = 512) work for either launch
    // shape; A[16][32] indexed as A[threadIdx.y][threadIdx.x] would
    // overflow its first dimension whenever blockDim.y is 32.
    __shared__ int A[THREADS_PER_BLOCK];
    __shared__ int B[THREADS_PER_BLOCK];

    A[tid] = dev_a[idx];
    B[tid] = dev_b[idx];

    __syncthreads();

    // Each thread reads back only the slot it wrote, so, as in
    // ReverseArray, shared memory provides no data reuse here.
    dev_c[idx] = A[tid] + B[tid];
}
int main( void ) {
cudaEvent_t timeStartEvent,timeEndEvent;
cudaEventCreate( &timeStartEvent, 0);
cudaEventCreate(&timeEndEvent, 0);
cudaEventRecord( timeStartEvent, 0);

int *a, *b, *c; // host copies of a, b, c
int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
int size = N * sizeof( int); // we need space for N integers
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, size );
a = (int*)malloc( size );
b = (int*)malloc( size );
c = (int*)malloc( size );


for (int i = 0; i < N; ++i){
    a[i] = rand();
    b[i] = rand();
}

//random_ints( a, N );
//random_ints( b, N );
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);
// launch add() kernel with blocks and threads
// (as in 3.1, the 1D launch is the one actually timed; the third launch
// parameter reserves dynamic shared memory, which the statically declared
// A[] and B[] in the kernel do not use)
dim3 dimGrid(32, 64);
dim3 dimBlock(16, 32);
int sharesize = 16*32*sizeof(int);
add<<< N/THREADS_PER_BLOCK, THREADS_PER_BLOCK, sharesize >>>( dev_a, dev_b, dev_c);
//add<<< dimGrid, dimBlock, sharesize >>>( dev_a, dev_b, dev_c);
// copy device result back to host copy of c

// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
    
cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost);

// Check for any CUDA errors
checkCUDAError("memcpy");
/*
// verify the data returned to the host is correct
for (int i = 0; i < N; i++)
{
    assert(c[i] == a[i]+b[i] );
}*/

free( a ); free( b ); free( c );
cudaFree( dev_a);
cudaFree( dev_b);
cudaFree( dev_c);

// If the program makes it this far, then the results are correct and
    // there are no run-time errors.  Good work!
    printf("Correct!\n");
    
		cudaEventRecord( timeEndEvent, 0) ;
		cudaEventSynchronize( timeEndEvent ) ;
		float elapsedTime = 0 ;
		cudaEventElapsedTime( & elapsedTime, timeStartEvent, timeEndEvent ) ;
		 
		cout << "elapsedTime  " << elapsedTime << " ms. ";
		cudaEventDestroy( timeStartEvent ) ;
		cudaEventDestroy( timeEndEvent ) ;
		
return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) 
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }                         
}

// running time: 235.297 ms

4 Analysis

The shared memory used in a kernel can reasonably be made somewhat larger, as long as the total per SM stays within the limit (the guide's figure is 16 KB; the K10 queried above reports 49152 bytes of shared memory per block). Here each block uses only 256 ints = 1 KB, so 3 blocks per SM amount to just 3 KB.

In ReverseArray, the size of array A is 256*1024, which is smaller than the maximum number of threads a grid can hold (1024*65535). Therefore we can start 256*1024 threads, one for each element of A.

We use 256 threads per block, as suggested. This way one SM can hold 3 blocks, i.e. 768 threads.

Comparing the GPU run times across the different dimGrid and dimBlock configurations, we conclude that the run time is essentially independent of the shape and dimensionality of the grid and block. Moreover, the data placed in shared memory is not reused in these kernels, so using shared memory does not decrease the run time.
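
Note also that the elapsed times reported above bracket the whole run (host allocation, host-device copies, and verification included), not just the kernel. A minimal sketch of timing only the kernel with CUDA events, assuming the kernel and launch configuration from 2.1, would look like:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);   // record in stream 0, immediately before the launch
reverseArrayBlock<<< dimGrid, dimBlock >>>(d_b, d_a, dimA);
cudaEventRecord(stop, 0);    // record immediately after the launch
cudaEventSynchronize(stop);  // wait for the kernel to finish

float kernelMs = 0;
cudaEventElapsedTime(&kernelMs, start, stop);
printf("kernel time: %f ms\n", kernelMs);

cudaEventDestroy(start);
cudaEventDestroy(stop);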

Because the amount of data is small (ReverseArray touches only 2 MB in total: 1 MB read plus 1 MB written), CUDA does not show its superiority over the CPU here.

5 References

NVIDIA CUDA: http://www.nvidia.cn/object/cuda-cn.html

Measuring CUDA execution time: http://www.cnblogs.com/lopezycj/archive/2011/08/09/cuda_time.html

Learning makefiles from the CUDA SDK: http://www.cnblogs.com/FreeAquar/archive/2012/04/03/2430860.html

Command-line gdb debugging on Linux: http://blog.csdn.net/dadalan/article/details/3758025


