I think the CUDA documentation lacks clear examples for cudaMallocPitch() and cudaMemcpy2D(). The code below is a good starting point for understanding what these functions do; I will add a more detailed explanation of them later.
/* Side of the matrix containing the data. */
#define N 760

/* Padded dimensions. */
#define PDIM 768

/* Threads per block. */
#define TPB 128
/*
 * kernel: copies the N x N data region of a row-padded matrix `da` into the
 * densely packed N x N matrix `db`.
 *
 * Parameters:
 *   da - device source, indexed as a flat array whose rows are PDIM floats
 *        apart. Precondition: the pitch of the allocation behind `da` must be
 *        exactly PDIM * sizeof(float); the signature carries no pitch, so the
 *        caller has to guarantee this.
 *   db - device destination holding N * N floats with a row stride of N.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread handles one padded
 * element; at least PDIM * N threads are needed to cover every data row.
 * Surplus threads are harmless because of the bounds guard below.
 */
__global__ void kernel(float *da, float *db)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    /* Position of this thread's element inside the padded matrix. Deriving it
     * from the flat index keeps the mapping correct for any block size. */
    int row = tid / PDIM;
    int col = tid % PDIM;

    /* Padding columns (col >= N) and rows beyond the data (row >= N) have no
     * counterpart in db: skipping them avoids reading past the data in da and
     * writing outside, or onto the wrong element of, db. */
    if (row < N && col < N) {
        db[row * N + col] = da[tid];
    }
}
/*
 * Forward declarations of the host-side helpers that are defined after main().
 *
 * init   - takes a float array and an element count.
 *          NOTE(review): presumably fills `array` with test data; confirm
 *          against the definition.
 * verify - takes two float arrays and an element count.
 *          NOTE(review): presumably compares A and B element by element;
 *          confirm against the definition.
 */
void init(float *array, int size);
void verify(float *A, float *B, int size);
/*
 * main: round-trips an N x N float matrix through pitched device memory.
 *
 * Steps:
 *   1. Allocate the host matrices A and B and pass A to init().
 *   2. Allocate dA with cudaMallocPitch() (rows are `pitch` bytes apart) and
 *      a dense dB with cudaMalloc().
 *   3. Copy A into dA with cudaMemcpy2D(), which converts between the dense
 *      host row stride and the device pitch.
 *   4. Launch kernel() to copy the data region of dA into dB.
 *   5. Copy dB back into B and pass A and B to verify().
 *
 * NOTE(review): the init()/verify() calls follow the prototypes declared in
 * this file (array, element count); confirm the element count N * N against
 * their definitions.
 *
 * Returns EXIT_SUCCESS when every allocation, copy and the kernel launch
 * succeeded, EXIT_FAILURE otherwise. All buffers are released on both paths.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const size_t rowBytes = sizeof(float) * N;  /* dense row width in bytes */
    const size_t matBytes = rowBytes * N;       /* dense N x N matrix size  */

    float *A = NULL, *dA = NULL, *B = NULL, *dB = NULL;
    size_t pitch = 0;

    A = (float *)malloc(matBytes);
    B = (float *)malloc(matBytes);
    bool ok = (A != NULL) && (B != NULL);

    if (ok) {
        init(A, N * N);
    }

    ok = ok && cudaMallocPitch(&dA, &pitch, rowBytes, N) == cudaSuccess;
    ok = ok && cudaMalloc(&dB, matBytes) == cudaSuccess;

    /* kernel() indexes dA as a flat array with PDIM floats per row and has no
     * pitch parameter, so any other pitch would silently give wrong results. */
    ok = ok && pitch == sizeof(float) * PDIM;

    ok = ok && cudaMemcpy2D(dA, pitch, A, rowBytes, rowBytes, N,
                            cudaMemcpyHostToDevice) == cudaSuccess;

    if (ok) {
        const int threadsperblock = TPB;
        /* Only N rows exist in dA and dB, so cover PDIM * N padded elements
         * (rounded up to whole blocks) instead of PDIM * PDIM. */
        const int blockspergrid =
            (PDIM * N + threadsperblock - 1) / threadsperblock;

        kernel<<<blockspergrid, threadsperblock>>>(dA, dB);

        /* A launch returns no status itself; bad configurations are reported
         * through cudaGetLastError(). */
        ok = cudaGetLastError() == cudaSuccess;
    }

    /* The blocking copy also reports errors raised while the kernel ran. */
    ok = ok && cudaMemcpy(B, dB, matBytes, cudaMemcpyDeviceToHost) == cudaSuccess;

    if (ok) {
        verify(A, B, N * N);
    }

    /* cudaFree() and free() both accept null pointers, so this is safe no
     * matter where a failure occurred. */
    cudaFree(dB);
    cudaFree(dA);
    free(B);
    free(A);

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
// init: host helper that takes a float array and an element count `size`.
// NOTE(review): this listing is missing lines (the braces and the loop body are
// not shown), so the value stored into each element cannot be confirmed here.
50 | void init( float * array, int size) |
// Runs once for every index i in [0, size).
52 | for ( int i = 0; i < size; i++) |
58 | void verify( float * A, float * B, int size) |
60 | for ( int i = 0; i < size; i++) |