cd ~/examples/cuda/
wget http://www.es.ele.tue.nl/~mwijtvliet/5KK73/downloads/cuda.zip -O cuda.zip
unzip ./cuda.zip
cd cuda
make
./matrixmul
void main(){
    define A, B, C
    for i = 0 to M do
        for j = 0 to N do
            /* compute element C(i,j) */
            for k = 0 to K do
                C(i,j) <= C(i,j) + A(i,k) * B(k,j)
            end
        end
    end
}
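For reference, a minimal C sketch of this triple loop is shown below. The flattened row-major layout (an M x K matrix A, a K x N matrix B, an M x N matrix C) is an assumption of this sketch, not something the pseudocode prescribes.

/* CPU reference sketch: row-major A (MxK), B (KxN), C (MxN) assumed */
void matrixMulCPU(const float *A, const float *B, float *C,
                  int M, int N, int K)
{
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;                        /* element C(i,j) */
            for (int k = 0; k < K; k++)
                sum += A[i * K + k] * B[k * N + j];  /* A(i,k) * B(k,j) */
            C[i * N + j] = sum;
        }
    }
}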
/* Code running on the CPU */
void main(){
    define A_cpu, B_cpu, C_cpu in the CPU memory
    define A_gpu, B_gpu, C_gpu in the GPU memory
    memcopy A_cpu to A_gpu
    memcopy B_cpu to B_gpu
    dim3 dimBlock(16, 16)
    dim3 dimGrid(N/dimBlock.x, M/dimBlock.y)
    matrixMul<<<dimGrid, dimBlock>>>(A_gpu, B_gpu, C_gpu, K)
    memcopy C_gpu to C_cpu
}
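A concrete host-side sketch of this pseudocode is given below. It assumes row-major float matrices, M and N that are multiples of 16, and a kernel that also takes N (flat row-major indexing needs the width of C); the wrapper name runMatrixMul is illustrative, and error checking is omitted for brevity.

#include <cuda_runtime.h>

void runMatrixMul(const float *A_cpu, const float *B_cpu, float *C_cpu,
                  int M, int N, int K)
{
    float *A_gpu, *B_gpu, *C_gpu;

    /* define A_gpu, B_gpu, C_gpu in the GPU memory */
    cudaMalloc(&A_gpu, M * K * sizeof(float));
    cudaMalloc(&B_gpu, K * N * sizeof(float));
    cudaMalloc(&C_gpu, M * N * sizeof(float));

    /* memcopy A_cpu to A_gpu and B_cpu to B_gpu */
    cudaMemcpy(A_gpu, A_cpu, M * K * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(B_gpu, B_cpu, K * N * sizeof(float), cudaMemcpyHostToDevice);

    /* one 16x16 thread block computes one 16x16 tile of C */
    dim3 dimBlock(16, 16);
    dim3 dimGrid(N / dimBlock.x, M / dimBlock.y);
    matrixMul<<<dimGrid, dimBlock>>>(A_gpu, B_gpu, C_gpu, N, K);

    /* memcopy C_gpu to C_cpu */
    cudaMemcpy(C_cpu, C_gpu, M * N * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(A_gpu);
    cudaFree(B_gpu);
    cudaFree(C_gpu);
}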
/* Code running on the GPU */
__global__ void matrixMul(A_gpu, B_gpu, C_gpu, K){
    accu <= 0
    // Row i of matrix C
    i <= blockIdx.y * blockDim.y + threadIdx.y
    // Column j of matrix C
    j <= blockIdx.x * blockDim.x + threadIdx.x
    for k = 0 to K-1 do
        accu <= accu + A_gpu(i,k) * B_gpu(k,j)
    end
    C_gpu(i,j) <= accu
}
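The same kernel written as compilable CUDA C might look as follows. The extra N parameter, the row-major layout, and the assumption that M, N and K are exact multiples of the 16x16 block size are additions of this sketch, not part of the pseudocode.

__global__ void matrixMul(const float *A_gpu, const float *B_gpu,
                          float *C_gpu, int N, int K)
{
    /* row i and column j of the C element this thread computes */
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    float accu = 0.0f;
    for (int k = 0; k < K; k++)
        accu += A_gpu[i * K + k] * B_gpu[k * N + j];  /* A(i,k) * B(k,j) */

    C_gpu[i * N + j] = accu;                          /* store C(i,j) */
}

In this naive version every thread reads an entire row of A and an entire column of B from global memory; the tiled versions below reduce that global traffic by staging tiles in shared memory.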
/* Code running on the GPU */
__global__ void matrixMul(A_gpu, B_gpu, C_gpu, K){
    __shared__ float A_tile(blockDim.y, blockDim.x)
    __shared__ float B_tile(blockDim.x, blockDim.y)
    accu <= 0
    /* Accumulate C tile by tile. */
    for tileIdx = 0 to (K/blockDim.x - 1) do
        /* Load one tile of A and one tile of B into shared mem */
        // Row i of matrix A
        i <= blockIdx.y * blockDim.y + threadIdx.y
        // Column j of matrix A
        j <= tileIdx * blockDim.x + threadIdx.x
        // Load A(i,j) to shared mem
        A_tile(threadIdx.y, threadIdx.x) <= A_gpu(i,j)
        // Load B(j,i) to shared mem; this global memory access is not coalesced
        B_tile(threadIdx.x, threadIdx.y) <= B_gpu(j,i)
        // Synchronize before computation
        __sync()
        /* Accumulate one tile of C from the tiles of A and B in shared mem */
        for k = 0 to blockDim.x - 1 do
            // Accumulate for matrix C
            accu <= accu + A_tile(threadIdx.y,k) * B_tile(k,threadIdx.x)
        end
        // Synchronize
        __sync()
    end
    // Row i of matrix C
    i <= blockIdx.y * blockDim.y + threadIdx.y
    // Column j of matrix C
    j <= blockIdx.x * blockDim.x + threadIdx.x
    // Store the accumulated value to C(i,j)
    C_gpu(i,j) <= accu
}
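Written out as compilable CUDA C (a sketch under the same assumptions as before: 16x16 blocks, row-major storage, dimensions that are multiples of TILE, an extra N parameter, illustrative kernel name), this first tiled version keeps the column-strided B load, so the non-coalesced access is visible in the index arithmetic.

#define TILE 16

__global__ void matrixMulTiledNoCoalescing(const float *A_gpu, const float *B_gpu,
                                           float *C_gpu, int N, int K)
{
    __shared__ float A_tile[TILE][TILE];
    __shared__ float B_tile[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;   /* row i of C */
    int col = blockIdx.x * TILE + threadIdx.x;   /* column j of C */
    float accu = 0.0f;

    for (int tileIdx = 0; tileIdx < K / TILE; tileIdx++) {
        /* A load: consecutive threadIdx.x -> consecutive addresses (coalesced) */
        A_tile[threadIdx.y][threadIdx.x] =
            A_gpu[row * K + tileIdx * TILE + threadIdx.x];
        /* B load: consecutive threadIdx.x -> addresses N floats apart
           (not coalesced); the tile is stored transposed in shared memory */
        B_tile[threadIdx.x][threadIdx.y] =
            B_gpu[(tileIdx * TILE + threadIdx.x) * N
                  + blockIdx.x * TILE + threadIdx.y];
        __syncthreads();

        for (int k = 0; k < TILE; k++)
            accu += A_tile[threadIdx.y][k] * B_tile[k][threadIdx.x];
        __syncthreads();
    }

    C_gpu[row * N + col] = accu;
}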
/* Code running on the GPU */
__global__ void matrixMul(A_gpu, B_gpu, C_gpu, K){
    __shared__ float A_tile(blockDim.y, blockDim.x)
    __shared__ float B_tile(blockDim.x, blockDim.y)
    accu <= 0
    /* Accumulate C tile by tile. */
    for tileIdx = 0 to (K/blockDim.x - 1) do
        /* Load one tile of A and one tile of B into shared mem */
        // Row i of matrix A
        i <= blockIdx.y * blockDim.y + threadIdx.y
        // Column j of matrix A
        j <= tileIdx * blockDim.x + threadIdx.x
        // Load A(i,j) to shared mem
        A_tile(threadIdx.y, threadIdx.x) <= A_gpu(i,j)
        // Load B(i,j) to shared mem; this global memory access is coalesced
        B_tile(threadIdx.x, threadIdx.y) <= B_gpu(i,j)
        // Synchronize before computation
        __sync()
        /* Accumulate one tile of C from the tiles of A and B in shared mem */
        for k = 0 to blockDim.x - 1 do
            // Accumulate for matrix C; this shared memory access causes bank conflicts
            accu <= accu + A_tile(threadIdx.y,k) * B_tile(threadIdx.x,k)
        end
        // Synchronize
        __sync()
    end
    // Row i of matrix C
    i <= blockIdx.y * blockDim.y + threadIdx.y
    // Column j of matrix C
    j <= blockIdx.x * blockDim.x + threadIdx.x
    // Store the accumulated value to C(i,j)
    C_gpu(i,j) <= accu
}
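The second variant as compilable CUDA C, under the same assumptions as the previous sketch: the B load is now coalesced, but because the tile is still stored transposed, the inner-loop read B_tile[threadIdx.x][k] makes the threads of a warp access shared-memory addresses TILE floats apart and run into bank conflicts.

#define TILE 16

__global__ void matrixMulTiledBankConflict(const float *A_gpu, const float *B_gpu,
                                           float *C_gpu, int N, int K)
{
    __shared__ float A_tile[TILE][TILE];
    __shared__ float B_tile[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float accu = 0.0f;

    for (int tileIdx = 0; tileIdx < K / TILE; tileIdx++) {
        A_tile[threadIdx.y][threadIdx.x] =
            A_gpu[row * K + tileIdx * TILE + threadIdx.x];
        /* coalesced: consecutive threadIdx.x -> consecutive B addresses,
           but the tile is still stored transposed */
        B_tile[threadIdx.x][threadIdx.y] =
            B_gpu[(tileIdx * TILE + threadIdx.y) * N + col];
        __syncthreads();

        for (int k = 0; k < TILE; k++)
            /* the B_tile read is strided by TILE floats -> bank conflicts */
            accu += A_tile[threadIdx.y][k] * B_tile[threadIdx.x][k];
        __syncthreads();
    }

    C_gpu[row * N + col] = accu;
}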
/* Code running on the GPU */
__global__ void matrixMul(A_gpu, B_gpu, C_gpu, K){
    __shared__ float A_tile(blockDim.y, blockDim.x)
    __shared__ float B_tile(blockDim.x, blockDim.y)
    accu <= 0
    /* Accumulate C tile by tile. */
    for tileIdx = 0 to (K/blockDim.x - 1) do
        /* Load one tile of A and one tile of B into shared mem */
        // Row i of matrix A
        i <= blockIdx.y * blockDim.y + threadIdx.y
        // Column j of matrix A
        j <= tileIdx * blockDim.x + threadIdx.x
        // Load A(i,j) to shared mem
        A_tile(threadIdx.y, threadIdx.x) <= A_gpu(i,j)
        // Load B(i,j) to shared mem; no shared memory bank conflict
        B_tile(threadIdx.y, threadIdx.x) <= B_gpu(i,j)
        // Synchronize before computation
        __sync()
        /* Accumulate one tile of C from the tiles of A and B in shared mem */
        for k = 0 to blockDim.x - 1 do
            // Accumulate for matrix C; no shared memory bank conflict
            accu <= accu + A_tile(threadIdx.y,k) * B_tile(k,threadIdx.x)
        end
        // Synchronize
        __sync()
    end
    // Row i of matrix C
    i <= blockIdx.y * blockDim.y + threadIdx.y
    // Column j of matrix C
    j <= blockIdx.x * blockDim.x + threadIdx.x
    // Store the accumulated value to C(i,j)
    C_gpu(i,j) <= accu
}
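The final variant in compilable CUDA C, again as a sketch under the same assumptions (16x16 blocks, row-major storage, dimensions that are multiples of TILE, illustrative kernel name): both tiles are stored in their natural orientation, so the global loads stay coalesced and the shared-memory reads in the inner loop are conflict free.

#define TILE 16

__global__ void matrixMulTiled(const float *A_gpu, const float *B_gpu,
                               float *C_gpu, int N, int K)
{
    __shared__ float A_tile[TILE][TILE];
    __shared__ float B_tile[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;   /* row i of C */
    int col = blockIdx.x * TILE + threadIdx.x;   /* column j of C */
    float accu = 0.0f;

    /* accumulate C tile by tile */
    for (int tileIdx = 0; tileIdx < K / TILE; tileIdx++) {
        /* both loads are coalesced: consecutive threadIdx.x ->
           consecutive global addresses */
        A_tile[threadIdx.y][threadIdx.x] =
            A_gpu[row * K + tileIdx * TILE + threadIdx.x];
        B_tile[threadIdx.y][threadIdx.x] =
            B_gpu[(tileIdx * TILE + threadIdx.y) * N + col];
        __syncthreads();

        /* B_tile[k][threadIdx.x] walks one row of shared memory:
           consecutive banks, no conflicts */
        for (int k = 0; k < TILE; k++)
            accu += A_tile[threadIdx.y][k] * B_tile[k][threadIdx.x];
        __syncthreads();
    }

    C_gpu[row * N + col] = accu;   /* store C(i,j) */
}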
/* CUDA code for inner product */
accu <= accu + A_tile(threadIdx.y,k) * B_tile(k, threadIdx.x)
/* Disassembled from cubin binary */
mov.b32 $r0, s[$ofs4+0x0000]
mad.rn.f32 $r9, s[$ofs1+0x002c], $r0, $r9
/* CUDA code for outer product */
/* accu[i] and b are stored in register file */
accu[i] <= accu[i] + A_tile(i) * b
/* Disassembled from cubin binary */
mad.rn.f32 $r9, s[$ofs2+0x0010], $r29, $r9
Each thread stores one element of B0,0 in a register. Each thread also stores one column of C0,0 in its registers.
Iteration 1: outer product between the first column of A0,0 and the first row of B0,0, and update C0,0. The inner accumulation loop can be unrolled by placing #pragma unroll in front of it, which removes the loop overhead.
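A hypothetical sketch of this outer-product formulation is given below. It assumes a one-dimensional thread block of TS threads in which each thread keeps one column of a TM x TS tile of C in registers (accu[]) and one element of B in a register (b); the names matrixMulOuter, TS and TM, the tile sizes, and the extra N parameter are illustrative, not taken from the assignment code, and dimensions are assumed to be multiples of the tile sizes.

#define TS 16   /* width of the C tile: one thread per column */
#define TM 16   /* height of the C tile: elements accumulated per thread */

__global__ void matrixMulOuter(const float *A_gpu, const float *B_gpu,
                               float *C_gpu, int N, int K)
{
    __shared__ float A_tile[TM][TS];
    float accu[TM];                            /* one column of the C tile */
    int col = blockIdx.x * TS + threadIdx.x;   /* column of C for this thread */
    int rowBase = blockIdx.y * TM;             /* first row of the C tile */

    #pragma unroll
    for (int i = 0; i < TM; i++)
        accu[i] = 0.0f;

    for (int t = 0; t < K; t += TS) {
        /* each thread loads one column of the A tile (coalesced across threads) */
        for (int i = 0; i < TM; i++)
            A_tile[i][threadIdx.x] = A_gpu[(rowBase + i) * K + t + threadIdx.x];
        __syncthreads();

        for (int k = 0; k < TS; k++) {
            float b = B_gpu[(t + k) * N + col];   /* one element of B in a register */
            /* rank-1 (outer product) update of the C column held in registers:
               one operand from shared memory, one from a register, matching
               the mad.rn.f32 pattern shown above */
            #pragma unroll
            for (int i = 0; i < TM; i++)
                accu[i] += A_tile[i][k] * b;
        }
        __syncthreads();
    }

    for (int i = 0; i < TM; i++)
        C_gpu[(rowBase + i) * N + col] = accu[i];
}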
/* Code running on the GPU */
__global__ void matrixMul(A_gpu, B_gpu, C_gpu, K){
    __shared__ float A_tile0(blockDim.y, blockDim.x)
    __shared__ float A_tile1(blockDim.y, blockDim.x)
    float *pointer0 = A_tile0
    float *pointer1 = A_tile1
    fetch one tile of matrix A_gpu to pointer0
    __sync()
    /* Accumulate C tile by tile. */
    for tileIdx = 0 to (K/blockDim.x - 1) do
        prefetch one tile of matrix A_gpu to pointer1
        accumulate C using pointer0
        __sync()
        swap pointer0 and pointer1
    end
    store the tile of C to global memory
}
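As compilable CUDA C, the double-buffering idea might look like the sketch below. Instead of swapping raw shared-memory pointers, each thread prefetches its element of the next A tile into a register during the current computation and moves it into the spare shared buffer after the synchronization; the kernel name, the extra N parameter, and the assumption of 16x16 blocks with dimensions that are multiples of TILE are, again, illustrative.

#define TILE 16

__global__ void matrixMulPrefetch(const float *A_gpu, const float *B_gpu,
                                  float *C_gpu, int N, int K)
{
    __shared__ float A_tiles[2][TILE][TILE];   /* two buffers, swapped each iteration */
    __shared__ float B_tile[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    int numTiles = K / TILE;
    float accu = 0.0f;
    int buf = 0;                               /* buffer the compute loop reads from */

    /* fetch the first A tile into buffer 0 */
    A_tiles[0][threadIdx.y][threadIdx.x] = A_gpu[row * K + threadIdx.x];

    for (int tileIdx = 0; tileIdx < numTiles; tileIdx++) {
        /* prefetch one element of the next A tile into a register; its
           global-memory latency is hidden behind the computation below */
        float a_next = 0.0f;
        if (tileIdx + 1 < numTiles)
            a_next = A_gpu[row * K + (tileIdx + 1) * TILE + threadIdx.x];

        /* load the B tile for the current iteration */
        B_tile[threadIdx.y][threadIdx.x] =
            B_gpu[(tileIdx * TILE + threadIdx.y) * N + col];
        __syncthreads();                       /* current A and B tiles are ready */

        for (int k = 0; k < TILE; k++)
            accu += A_tiles[buf][threadIdx.y][k] * B_tile[k][threadIdx.x];
        __syncthreads();                       /* everyone is done with the tiles */

        /* move the prefetched element into the spare buffer and swap */
        A_tiles[1 - buf][threadIdx.y][threadIdx.x] = a_next;
        buf = 1 - buf;
    }

    C_gpu[row * N + col] = accu;
}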