// file: vecAdd.cu
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
// GPU kernel: element-wise vector addition, out[i] = lhs[i] + rhs[i].
// __global__ marks this as a GPU entry point, launched from the host.
//
// Written as a grid-stride loop so it is correct for ANY launch
// configuration (including a single block while debugging), not only the
// one-thread-per-element launch used below. Inputs are const-qualified;
// callers passing int* are still compatible.
__global__ void vecAdd(int N, const int* lhs, const int* rhs, int* out) {
  int stride = gridDim.x * blockDim.x;  // total threads in the grid
  // Each thread handles indices tid, tid+stride, tid+2*stride, ...;
  // the loop condition doubles as the out-of-bounds guard.
  for (int ind = blockIdx.x * blockDim.x + threadIdx.x; ind < N;
       ind += stride)
    out[ind] = lhs[ind] + rhs[ind];
}
// Vector-addition demo driver: fills lhs with zeros and rhs with 0..N-1,
// computes out = lhs + rhs on the GPU, prints the result (expected: i at
// index i). Returns 0 on success, 1 on allocation failure.
int main() {
  const int N = 128;
  const size_t BYTES = N * sizeof(int);
  int *h_lhs, *h_rhs, *h_out;  // host buffers
  int *d_lhs, *d_rhs, *d_out;  // device buffers
  h_lhs = (int*)malloc(BYTES);
  h_rhs = (int*)malloc(BYTES);
  h_out = (int*)malloc(BYTES);
  if (!h_lhs || !h_rhs || !h_out) {
    fprintf(stderr, "host allocation failed\n");
    return 1;
  }
  memset(h_lhs, 0, BYTES);                    // lhs = all zeros
  for (int i = 0; i < N; ++i) h_rhs[i] = i;   // rhs = 0..N-1
  // Allocate device memory. cudaMalloc writes the device pointer back
  // through its first argument, hence the pointer-to-pointer cast.
  cudaError_t err;
  if ((err = cudaMalloc((void**)&d_lhs, BYTES)) != cudaSuccess ||
      (err = cudaMalloc((void**)&d_rhs, BYTES)) != cudaSuccess ||
      (err = cudaMalloc((void**)&d_out, BYTES)) != cudaSuccess) {
    fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    return 1;
  }
  // Copy the operands to the GPU.
  cudaMemcpy(d_lhs, h_lhs, BYTES, cudaMemcpyHostToDevice);
  cudaMemcpy(d_rhs, h_rhs, BYTES, cudaMemcpyHostToDevice);
  // <<<blocks, threads>>> is the launch configuration: thread blocks per
  // grid and threads per block. Ceil-division so every element is covered
  // even when N is not a multiple of the block size.
  const int THREADS = 256;
  const int BLOCKS = (N + THREADS - 1) / THREADS;
  vecAdd<<<BLOCKS, THREADS>>>(N, d_lhs, d_rhs, d_out);
  // Launch-configuration errors only surface via cudaGetLastError();
  // note the error string must NOT be used as the printf format string.
  err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(err));
  // Blocking copy back to the host; it also synchronizes with the kernel,
  // so h_out is valid once it returns.
  err = cudaMemcpy(h_out, d_out, BYTES, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
    fprintf(stderr, "cudaMemcpy D2H: %s\n", cudaGetErrorString(err));
  for (int i = 0; i < N; ++i) printf("%d\n", h_out[i]);
  // Release device and host memory (the original leaked all six buffers).
  cudaFree(d_lhs);
  cudaFree(d_rhs);
  cudaFree(d_out);
  free(h_lhs);
  free(h_rhs);
  free(h_out);
  return 0;
}
编译方法:
nvcc vecAdd.cu -o vecAdd.exe
可通过 -arch 参数指定目标 GPU 架构,例如:
nvcc vecAdd.cu -o vecAdd.exe -arch=sm_50 -Wno-deprecated-gpu-targets
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
// Thread-block width and convolution half-width (halo on each side).
const int BLOCK_SIZE = 128, RADIUS = 5;
// GPU kernel: 1D convolution — out[i] = sum of in[i-RADIUS .. i+RADIUS],
// computed from a tile staged in shared memory.
//
// Preconditions (not checked in the kernel):
//   - blockDim.x == BLOCK_SIZE: the halo indexing below hard-codes
//     BLOCK_SIZE, so any other block width silently corrupts the tile;
//   - `in` points RADIUS elements past the start of its buffer, and the
//     buffer holds gridDim.x * BLOCK_SIZE + 2 * RADIUS elements, so the
//     halo reads in[global_ind - RADIUS] / in[global_ind + BLOCK_SIZE]
//     stay in bounds;
//   - there is no tail guard: the launch must cover exactly the valid
//     elements (element count a multiple of BLOCK_SIZE).
__global__ void conv_1d(int *in, int *out) {
  // One instance of `shared` per block, visible to all of its threads:
  // BLOCK_SIZE center elements plus a RADIUS-wide halo on each side.
  __shared__ int shared[BLOCK_SIZE + 2 * RADIUS];
  int global_ind =
      blockIdx.x * blockDim.x + threadIdx.x; // flat global thread ID
  int local_ind = threadIdx.x + RADIUS;      // this thread's slot in the tile
  shared[local_ind] = in[global_ind]; // each thread stages its center element
  // The first RADIUS threads additionally stage the left and right halos.
  if (threadIdx.x < RADIUS) {
    shared[local_ind - RADIUS] = in[global_ind - RADIUS];
    shared[local_ind + BLOCK_SIZE] = in[global_ind + BLOCK_SIZE];
  }
  // Barrier: no thread may read the tile until every element (center and
  // halo) has been written by its staging thread.
  __syncthreads();
  // Sum the 2*RADIUS+1 neighborhood entirely from shared memory.
  int value = 0;
  for (int offset = -RADIUS; offset <= RADIUS; ++offset)
    value += shared[local_ind + offset];
  out[global_ind] = value;
}
// 1D-convolution demo driver: the input is all ones (including the halo),
// so every output element should equal 2 * RADIUS + 1 = 11.
// Returns 0 on success, 1 on allocation failure.
int main() {
  const int N_VALID = 256;                   // elements we compute
  const int N_TOTAL = N_VALID + 2 * RADIUS;  // plus a halo on each side
  int *h_in, *h_out;  // host buffers
  int *d_in, *d_out;  // device buffers
  h_in = (int *)malloc(N_TOTAL * sizeof(int));
  h_out = (int *)malloc(N_VALID * sizeof(int));
  if (!h_in || !h_out) {
    fprintf(stderr, "host allocation failed\n");
    return 1;
  }
  // memset assigns byte-by-byte, so it is only usable for 0 or -1 fills;
  // use an explicit loop to fill with 1.
  for (int i = 0; i < N_TOTAL; ++i) h_in[i] = 1;
  // Allocate device memory. cudaMalloc writes the device pointer back
  // through its first argument, hence the pointer-to-pointer cast.
  cudaError_t err;
  if ((err = cudaMalloc((void **)&d_in, N_TOTAL * sizeof(int))) !=
          cudaSuccess ||
      (err = cudaMalloc((void **)&d_out, N_VALID * sizeof(int))) !=
          cudaSuccess) {
    fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    return 1;
  }
  // Copy the padded input to the GPU.
  cudaMemcpy(d_in, h_in, N_TOTAL * sizeof(int), cudaMemcpyHostToDevice);
  // <<<blocks, threads>>> is the launch configuration. The kernel requires
  // blockDim.x == BLOCK_SIZE and a launch covering exactly N_VALID
  // elements; d_in is offset by RADIUS so index 0 inside the kernel is the
  // first VALID element and the halo reads land in the padding.
  conv_1d<<<(N_VALID + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
      d_in + RADIUS, d_out);
  // Launch-configuration errors only surface via cudaGetLastError();
  // note the error string must NOT be used as the printf format string.
  err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(err));
  // Blocking copy back to the host; it also synchronizes with the kernel.
  err = cudaMemcpy(h_out, d_out, N_VALID * sizeof(int),
                   cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
    fprintf(stderr, "cudaMemcpy D2H: %s\n", cudaGetErrorString(err));
  for (int i = 0; i < N_VALID; ++i) printf("%d ", h_out[i]);
  printf("\n");
  // Release device and host memory (the original leaked all four buffers).
  cudaFree(d_in);
  cudaFree(d_out);
  free(h_in);
  free(h_out);
  return 0;
}