Saturday, May 5, 2012

cudaMallocPitch() and cudaMemcpy2D() example


I think the CUDA documentation lacks clear examples of cudaMallocPitch() and cudaMemcpy2D(). The code below is a good starting point for understanding what these functions do. I will add a more detailed explanation later.
01#include<stdio.h>
02#include<stdlib.h>
03#include<assert.h>
04#define N 760 // side of matrix containing data
05#define PDIM 768 // padded dimensions
06#define TPB 128 //threads per block
07#define DIV 6
08 
// Copies the valid N x N region of the padded device matrix `da` into the
// densely packed matrix `db`, so the host can verify the 2D copy.
//
// Layout assumed by the indexing below:
//   da - rows of PDIM floats (N data columns followed by PDIM - N padding
//        columns), at least N rows. The kernel never sees the pitch, so the
//        pitch returned by cudaMallocPitch must equal PDIM * sizeof(float).
//   db - N x N floats, row-major, no padding.
//
// Launch: 1D grid of 1D blocks, one thread per padded element. Any grid that
// covers at least N * PDIM threads works; surplus threads do nothing.
__global__ void kernel(float * da, float * db)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    int row = tid / PDIM;   // row in the padded matrix
    int col = tid % PDIM;   // column in the padded matrix, padding included

    // row < N: da and db only hold N rows, so threads mapped to the extra
    //          rows of a PDIM x PDIM launch must not touch memory.
    // col < N: skip the padding columns at the end of every row.
    if (row < N && col < N)
    {
        db[row * N + col] = da[tid];
    }
}
19 
// Forward declarations of the host-side helpers defined after main().
20void verify(float * A, float * B, int size);
21void init(float * array, int size);
22 
// Exits with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char * what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Round-trips an N x N host matrix through a pitched device allocation:
// host A -> pitched dA (cudaMemcpy2D) -> packed dB (kernel) -> host B,
// then checks that B matches A element for element.
int main(int argc, char * argv[])
{
    (void)argc;
    (void)argv;

    float * A = (float *)malloc(sizeof(float)*N*N);
    float * B = (float *)malloc(sizeof(float)*N*N);
    float * dA = NULL;
    float * dB = NULL;
    if (A == NULL || B == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    init(A,N*N);

    // dA: N rows, each N floats wide plus whatever padding the runtime adds;
    // `pitch` receives the padded row width in bytes.
    size_t pitch = 0;
    checkCuda(cudaMallocPitch(&dA, &pitch, sizeof(float)*N, N), "cudaMallocPitch");
    checkCuda(cudaMalloc(&dB, sizeof(float)*N*N), "cudaMalloc");

    // The kernel indexes dA as rows of PDIM floats and is not given the pitch,
    // so refuse to run if the runtime chose a different row width.
    if (pitch != sizeof(float)*PDIM)
    {
        fprintf(stderr, "Unexpected pitch: got %lu bytes, kernel assumes %lu\n",
                (unsigned long)pitch, (unsigned long)(sizeof(float)*PDIM));
        cudaFree(dA);
        cudaFree(dB);
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    // Copy the unpadded 760 x 760 host array A into the pitched device array:
    // 760 rows, each padded to 768 floats for aligned row starts.
    // Widths and pitches are given in bytes, the height in rows.
    checkCuda(cudaMemcpy2D(dA, pitch, A, sizeof(float)*N, sizeof(float)*N, N,
                           cudaMemcpyHostToDevice), "cudaMemcpy2D");

    // One thread per padded element of the N allocated rows. Launching
    // PDIM rows would send threads past the end of both dA and dB.
    int threadsperblock = TPB;
    int blockspergrid = (N*PDIM + threadsperblock - 1)/threadsperblock;
    kernel<<<blockspergrid,threadsperblock>>>(dA,dB);
    checkCuda(cudaGetLastError(), "kernel launch");

    // dB is densely packed, so a plain cudaMemcpy is enough. This copy blocks,
    // so it also reports errors raised while the kernel was executing.
    // The equivalent 2D call takes pitches and width in bytes:
    //   cudaMemcpy2D(B, sizeof(float)*N, dB, sizeof(float)*N,
    //                sizeof(float)*N, N, cudaMemcpyDeviceToHost);
    checkCuda(cudaMemcpy(B, dB, sizeof(float)*N*N, cudaMemcpyDeviceToHost), "cudaMemcpy");
    verify(A,B,N*N);

    free(A);
    free(B);
    checkCuda(cudaFree(dA), "cudaFree(dA)");
    checkCuda(cudaFree(dB), "cudaFree(dB)");
    return 0;
}
49 
// Fills array[0 .. size-1] with the ramp 0, 1, 2, ... so that every element
// holds its own index (converted to float). Does nothing when size <= 0.
void init(float * array, int size)
{
    float * cursor = array;
    for (int value = 0; value < size; ++value, ++cursor)
    {
        *cursor = value;
    }
}
57 
// Compares the first `size` elements of A and B for exact equality.
// On the first mismatch, prints the index and both values to stderr and
// aborts; otherwise prints "Correct!".
//
// The comparison is an explicit check rather than assert(), so it still runs
// when the program is built with NDEBUG. Exact equality is intended: B is
// expected to be a bit-for-bit copy of A, not a recomputed value.
void verify(float * A, float * B, int size)
{
    for (int i = 0; i < size; i++)
    {
        if (A[i] != B[i])
        {
            fprintf(stderr, "Mismatch at index %d: expected %f, got %f\n",
                    i, A[i], B[i]);
            abort();
        }
    }
    printf("Correct!");
}