Yun's Blog

from:http://claritycode.wordpress.com/2012/02/03/cudamallocpitch-and-cudamemcpy2d-example/

I think that cudaMallocPitch() and cudaMemcpy2D() do not have clear examples in the CUDA documentation. I think the code below is a good starting point for understanding what these functions do. I will write up a more detailed explanation of them later on.

01#include<stdio.h>

02#include<stdlib.h>

03#include<assert.h>

// Side length of the square data matrix (N x N floats).
#define N 760

// Padded row length, in floats, that the kernel assumes for the pitched
// device buffer. Must be >= N and a multiple of TPB.
#define PDIM 768

// Threads per block used for the kernel launch.
#define TPB 128

// Number of thread blocks that cover one padded row. Derived from PDIM and
// TPB (instead of a hard-coded 6) so the three values cannot drift apart;
// parenthesized so it is safe inside larger expressions such as `x / DIV`.
#define DIV (PDIM / TPB)

08 

// Copies the N x N payload of the padded matrix `da` into the dense matrix
// `db`, so the host can verify that the pitched upload was correct.
//
// Expected layout:
//   - 1-D launch; each thread handles the element at its flat global index.
//   - `da` rows are PDIM floats apart, i.e. the pitch of the source buffer
//     must equal PDIM * sizeof(float). The kernel has no pitch parameter, so
//     the caller is responsible for guaranteeing this.
//   - `db` is a dense, row-major N x N array.
//
// Threads that land in the padding columns (col >= N) or past the last data
// row (row >= N) do nothing. The row check matters: a grid sized for
// PDIM x PDIM elements would otherwise read beyond the N allocated rows of
// `da` and write beyond the N*N elements of `db`.
__global__ void kernel(float * da, float * db)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Position inside the padded matrix. Deriving both from the flat index
    // keeps the mapping valid for any 1-D block size.
    int row = tid / PDIM;
    int col = tid % PDIM;

    if (row < N && col < N)
    {
        db[row * N + col] = da[tid];
    }
}

19 

// Host helpers, defined after main().

// Fills `array` with `size` values.
void init(float * array, int size);

// Compares the first `size` elements of `A` and `B`.
void verify(float * A, float * B, int size);

22 

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char * what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Round-trips an N x N float matrix through the GPU:
//   1. upload host matrix A into a pitched device buffer with cudaMemcpy2D,
//   2. let the kernel repack the padded rows into a dense device buffer,
//   3. download the dense buffer into B and compare it with A.
// Returns 0 on success and EXIT_FAILURE if an allocation fails or the device
// pitch does not match what the kernel assumes.
int main(int argc, char * argv[])
{
    (void)argc;
    (void)argv;

    float * A, *dA, *B, *dB;
    A = (float *)malloc(sizeof(float)*N*N);
    B = (float *)malloc(sizeof(float)*N*N);
    if (A == NULL || B == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    init(A, N*N);

    // Allocate N rows of N floats; the runtime chooses the row pitch (bytes).
    size_t pitch;
    checkCuda(cudaMallocPitch(&dA, &pitch, sizeof(float)*N, N), "cudaMallocPitch");

    // The kernel has no pitch parameter and indexes rows PDIM floats apart,
    // so refuse to run if the device picked a different pitch.
    if (pitch != sizeof(float)*PDIM)
    {
        fprintf(stderr, "unexpected pitch: got %lu bytes, kernel assumes %lu\n",
                (unsigned long)pitch, (unsigned long)(sizeof(float)*PDIM));
        cudaFree(dA);
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc(&dB, sizeof(float)*N*N), "cudaMalloc");

    // Copy the unpadded N x N host array into the pitched device rows.
    // Pitches and width are given in bytes, the height in rows.
    checkCuda(cudaMemcpy2D(dA, pitch, A, sizeof(float)*N, sizeof(float)*N, N,
                           cudaMemcpyHostToDevice),
              "cudaMemcpy2D (host to device)");

    // One thread per padded element, but only for the N rows that were
    // actually allocated: N rows times (PDIM / TPB) blocks per row.
    int threadsperblock = TPB;
    int blockspergrid = N * (PDIM / threadsperblock);
    kernel<<<blockspergrid,threadsperblock>>>(dA,dB);
    checkCuda(cudaGetLastError(), "kernel launch");

    // Blocking copy: also waits for the kernel and reports its errors.
    // dB is dense, so the equivalent 2-D copy would be
    // cudaMemcpy2D(B, sizeof(float)*N, dB, sizeof(float)*N, sizeof(float)*N, N,
    //              cudaMemcpyDeviceToHost).
    checkCuda(cudaMemcpy(B, dB, sizeof(float)*N*N, cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");

    verify(A,B,N*N);

    free(A);
    free(B);
    checkCuda(cudaFree(dA), "cudaFree(dA)");
    checkCuda(cudaFree(dB), "cudaFree(dB)");
    return 0;
}

49 

// Writes the ramp 0, 1, 2, ... into array[0 .. size-1], so every element
// holds its own flat index converted to float. Does nothing when size <= 0.
void init(float * array, int size)
{
    int idx = 0;
    while (idx < size)
    {
        array[idx] = (float)idx;
        ++idx;
    }
}

57 

// Checks that the first `size` elements of A and B are exactly equal.
// Exact float comparison is intended: B is expected to be a bit-for-bit copy
// of A, not the result of arithmetic.
//
// On the first mismatch the index and both values are printed to stderr and
// the program aborts. The check is an explicit comparison rather than
// assert(), so it still runs when the file is compiled with -DNDEBUG.
// Prints "Correct!" when every element matches.
void verify(float * A, float * B, int size)
{
    for (int i = 0; i < size; i++)
    {
        if (A[i] != B[i])
        {
            fprintf(stderr, "verify failed at index %d: A=%f B=%f\n",
                    i, (double)A[i], (double)B[i]);
            abort();
        }
    }
    printf("Correct!");
}

Yun's Blog

Saturday, May 5, 2012

cudaMallocPitch() and cudaMemcpy2D() example