cuda求矩阵每一行最大值

2、完成一个尺寸512*512的二维数组的每一行最大值的并行程序实现数据类型设置为float。需要完成4个版本。
(1) 不使用共享内存,只使用全局内存;采用具有分支发散的并行归约;

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define index 512
# define TILE_WIDTH 2

// In-place row-max reduction over global memory, version WITH branch
// divergence: at step s only threads whose id is a multiple of 2*s are
// active, so working lanes are scattered across every warp.
// Launch layout assumed: one block per matrix row, blockDim.x == row length
// (must be a power of two, established by the fixed 512x512 launch in main).
// On exit, element 0 of each row holds that row's maximum; the other row
// elements are clobbered with partial results. Width is unused here.
__global__ void  calcSum(float* AA, int Width)
{
	const unsigned int tid = threadIdx.x;
	const int pos = blockIdx.x * blockDim.x + threadIdx.x;
	for (unsigned int s = 1; s < blockDim.x; s <<= 1)
	{
		// Barrier before each step so every write of the previous step is visible.
		__syncthreads();
		// (tid & (2*s - 1)) == 0  is equivalent to  tid % (2*s) == 0
		// because s is always a power of two.
		if ((tid & (2 * s - 1)) == 0 && AA[pos + s] > AA[pos])
			AA[pos] = AA[pos + s];
	}
}

// Computes the maximum of each row of a 512x512 float matrix on the GPU.
// Launches one 512-thread block per row; calcSum leaves each row's maximum
// in the row's first element, which is then printed on the host.
// Returns 0 on success, 1 on any CUDA/allocation failure.
int main()
{
	cudaError_t cudaStatus = cudaSuccess;
	// Matrix dimensions: Ndim rows x Pdim columns; Width is the row length.
	int Ndim = 0, Pdim = 0, Width = 0;
	Ndim = Pdim = Width = index;
	int szA = Ndim * Pdim;
	float* A, * AA;   // A: host buffer, AA: device buffer
	A = (float*)malloc(szA * sizeof(float));
	if (A == NULL) {
		fprintf(stderr, "host malloc failed!");
		return 1;
	}
	int i;
	// Initialize the matrix with 1..szA (may be replaced by a student id),
	// so each row's maximum is its last element.
	for (i = 0; i < szA; i++)
		A[i] = (float)(i + 1);
	cudaStatus = cudaMalloc((void**)&AA, szA * sizeof(float));
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc1 failed!");
		free(A);
		return 1;
	}
	cudaStatus = cudaMemcpy(AA, A, szA * sizeof(float), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy1 failed!");
		cudaFree(AA);
		free(A);
		return 1;
	}
	// One block per row, one thread per column.
	dim3 dimGrid(index);
	dim3 dimBlock(index);
	calcSum << <dimGrid, dimBlock >> > (AA, Width);
	// A kernel launch returns no status directly; it must be queried.
	// Without this call, the check below re-tests the stale cudaMemcpy status.
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "calcSum failed!");
		cudaFree(AA);
		free(A);
		return 1;
	}
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching calcSum!\n", cudaStatus);
	}
	// Copy output vector from GPU buffer to host memory.
	cudaStatus = cudaMemcpy(A, AA, szA * sizeof(float), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
	}
	// Each row's maximum now sits in the row's first element.
	for (int k = 0; k < szA; k += Width)
		printf("The RoWmax is :%.1f\n", A[k]);
	printf("\nArray A:\n");
	for (i = 0; i < Ndim; i++) {
		for (int j = 0; j < Pdim; j++)
			printf("%.1f\t", A[i * Pdim + j]);
		printf("\n");
	}
	cudaFree(AA);
	free(A);
	return 0;
}

(2)不使用共享内存,只使用全局内存;采用无分支发散的并行归约;
注:与上题一样,只是核函数改变

// In-place row-max reduction over global memory, divergence-free variant:
// the active threads always form the contiguous range [0, stride), so whole
// warps retire together instead of diverging within a warp.
// Launch layout assumed: one block per matrix row, blockDim.x == row length
// (power of two). On exit, element 0 of each row holds that row's maximum.
// Width is unused here.
__global__ void  calcSum(float* AA, int Width)
{
	const unsigned int tid = threadIdx.x;
	const int pos = blockIdx.x * blockDim.x + threadIdx.x;
	unsigned int stride = blockDim.x >> 1;
	while (stride > 0)
	{
		// Barrier before each step so every write of the previous step is visible.
		__syncthreads();
		if (tid < stride)
		{
			const float rival = AA[pos + stride];
			if (rival > AA[pos])
				AA[pos] = rival;
		}
		stride >>= 1;
	}
}

(3) 使用共享内存;采用具有分支发散的并行归约;

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define index 512
#define TILE_WIDTH 2
// Row-max reduction using shared memory, divergent variant: each block
// stages its row in shared memory, reduces there, and writes the maximum
// back to the row's first element in global memory.
// Launch layout assumed: one block per row, blockDim.x == index (power of
// two). Width is unused here.
__global__ void  calcSum(float* AA, int Width)
{
	// A full row fits in shared memory (data is small), so no tiling is
	// needed: each block stages its entire row at once.
	__shared__ float middleware[index];
	unsigned int tid = threadIdx.x;
	int Row = blockIdx.x * blockDim.x + threadIdx.x;
	middleware[tid] = AA[Row];
	for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
	{
		// Barrier before each step: also separates the staging store above
		// from the first cross-thread read.
		__syncthreads();
		if (tid % (2 * stride) == 0 && middleware[tid + stride] > middleware[tid])
			middleware[tid] = middleware[tid + stride];
	}
	// The row maximum ends up in shared element 0; publish it to global memory.
	if (tid == 0) AA[Row] = middleware[0];
}
// Computes the maximum of each row of a 512x512 float matrix on the GPU
// (shared-memory kernel version). Launches one 512-thread block per row;
// calcSum leaves each row's maximum in the row's first element.
// Returns 0 on success, 1 on any CUDA/allocation failure.
int main()
{
	cudaError_t cudaStatus = cudaSuccess;
	// Matrix dimensions: Ndim rows x Pdim columns; Width is the row length.
	int Ndim = 0, Pdim = 0, Width = 0;
	Ndim = Pdim = Width = index;
	int szA = Ndim * Pdim;
	float* A, * AA;   // A: host buffer, AA: device buffer
	A = (float*)malloc(szA * sizeof(float));
	if (A == NULL) {
		fprintf(stderr, "host malloc failed!");
		return 1;
	}
	int i;
	// Initialize the matrix with 1..szA (may be replaced by a student id).
	for (i = 0; i < szA; i++)
		A[i] = (float)(i + 1);
	cudaStatus = cudaMalloc((void**)&AA, szA * sizeof(float));
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc1 failed!");
		free(A);
		return 1;
	}
	cudaStatus = cudaMemcpy(AA, A, szA * sizeof(float), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy1 failed!");
		cudaFree(AA);
		free(A);
		return 1;
	}
	// One block per row, one thread per column.
	dim3 dimGrid(index);
	dim3 dimBlock(index);
	calcSum << <dimGrid, dimBlock >> > (AA, Width);
	// A kernel launch returns no status directly; it must be queried.
	// Without this call, the check below re-tests the stale cudaMemcpy status.
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "calcSum failed!");
		cudaFree(AA);
		free(A);
		return 1;
	}
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching calcSum!\n", cudaStatus);
	}
	// Copy output vector from GPU buffer to host memory.
	cudaStatus = cudaMemcpy(A, AA, szA * sizeof(float), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
	}
	// Each row's maximum now sits in the row's first element.
	for (int k = 0; k < szA; k += Width)
		printf("The RoWmax is :%.1f\n", A[k]);
	printf("\nArray A:\n");
	for (i = 0; i < Ndim; i++) {
		for (int j = 0; j < Pdim; j++)
			printf("%.1f\t", A[i * Pdim + j]);
		printf("\n");
	}
	cudaFree(AA);
	free(A);
	return 0;
}

(4)使用共享内存,采用无分支发散的并行归约;
注:核函数改变,截图如上

// Row-max reduction using shared memory, divergence-free variant: the active
// threads always form the contiguous range [0, stride), so whole warps retire
// together instead of diverging within a warp.
// Launch layout assumed: one block per row, blockDim.x == index (power of
// two). Width is unused here.
__global__ void  calcSum(float* AA, int Width)
{
	// A full row fits in shared memory (data is small), so no tiling is
	// needed: each block stages its entire row at once.
	__shared__ float middleware[index];
	unsigned int tid = threadIdx.x;
	int Row = blockIdx.x * blockDim.x + threadIdx.x;
	middleware[tid] = AA[Row];
	// BUG FIX: the reduction must start at blockDim.x / 2, not blockDim.x.
	// Starting at blockDim.x made the first iteration read
	// middleware[tid + blockDim.x], past the end of the shared array
	// (indices up to 1023 in a 512-element buffer) — undefined behavior.
	for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
	{
		// Barrier before each step: also separates the staging store above
		// from the first cross-thread read.
		__syncthreads();
		if (tid < stride && middleware[tid + stride] > middleware[tid])
			middleware[tid] = middleware[tid + stride];
	}
	// The row maximum ends up in shared element 0; publish it to global memory.
	if (tid == 0) AA[Row] = middleware[0];
}

测试16*16时结果是否正确:
cuda求矩阵每一行最大值_第1张图片

你可能感兴趣的:(cuda,gpu,cuda,并行计算)