/*
 * Allocate a vector on the GPU and initialize it on the device.
 *
 *   - 2D grid of 2D thread blocks.
 *   - Each thread writes its own flat global index into one vector element:
 *       i = blockIdx.y * (gridDim.x * blockSize)
 *         + blockIdx.x * blockSize
 *         + threadIdx.y * blockDim.x
 *         + threadIdx.x
 *     where blockSize = blockDim.x * blockDim.y.
 *   - The host copies the vector back, checks that element i holds the value i,
 *     and prints the corner values of a few thread blocks.
 */

// Includes
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

// Variables
float* h_V = NULL;      // host vector (h_ prefix: host memory)
float* d_V = NULL;      // device vector (d_ prefix: device memory)
bool noprompt = false;

// Functions
void Cleanup(bool);
void ParseArguments(int, char**);

/*
 * Flat vector index of thread (thrIdxX, thrIdxY) inside block (blkIdxX, blkIdxY).
 *
 * Blocks are numbered row by row across the grid; threads are numbered row by
 * row inside a block. Shared by the kernel and by the host-side corner lookup
 * so both sides agree on a single layout definition.
 *
 *   gridDimX   number of blocks in one grid row
 *   blockDimX  number of threads in one block row
 *   blockDimY  number of thread rows in one block
 */
__host__ __device__ static inline int FlatIndex(int gridDimX, int blockDimX, int blockDimY,
                                                int blkIdxX, int blkIdxY,
                                                int thrIdxX, int thrIdxY)
{
    int blockSize    = blockDimX * blockDimY;   // # threads in one thread block
    int nThrPerGrRow = gridDimX * blockSize;    // # threads per grid row
    return blkIdxY * nThrPerGrRow + blkIdxX * blockSize + thrIdxY * blockDimX + thrIdxX;
}

// Device code

/*
 * Each thread stores its flat global index (as a float) into V.
 *
 * Expected launch: 2D grid of 2D blocks; no shared memory is used.
 *
 *   V  device vector with at least N elements
 *   N  number of elements in V; threads whose index is >= N write nothing
 */
__global__ void TouchVec(float* V, int N)
{
    int i = FlatIndex((int)gridDim.x, (int)blockDim.x, (int)blockDim.y,
                      (int)blockIdx.x, (int)blockIdx.y,
                      (int)threadIdx.x, (int)threadIdx.y);

    // Guard against launch configurations that cover more threads than elements.
    if (i < N)
        V[i] = (float)i;
}

// Host code

/*
 * Report a failed CUDA runtime call and terminate through Cleanup(false).
 * Does nothing when status is cudaSuccess.
 *
 *   status  return code of the CUDA call (or of cudaGetLastError())
 *   what    short description of the operation, used in the message
 */
static void CheckCuda(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return;

    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    Cleanup(false);   // does not return
}

/*
 * Read the element of h_V written by thread (thrIdxX, thrIdxY) of block
 * (blkIdxX, blkIdxY) for the given launch geometry.
 */
static float HostValueAt(dim3 grid, dim3 block,
                         int blkIdxX, int blkIdxY, int thrIdxX, int thrIdxY)
{
    return h_V[FlatIndex((int)grid.x, (int)block.x, (int)block.y,
                         blkIdxX, blkIdxY, thrIdxX, thrIdxY)];
}

int main(int argc, char** argv)
{
    (void)argc;   // command line is currently unused
    (void)argv;

    printf("Allocate Vector, initialize it in GPU, grid 2D, block 2D \n");

    dim3 dimGrid(4, 2);         // GridDx=4 width, GridDy=2 height
    dim3 dimBlock(16, 8, 1);    // BlockDx=16 width, BlockDy=8 height

    // One element per launched thread, so the vector exactly fits the grid / block space.
    int N = (int)(dimGrid.x * dimGrid.y * dimBlock.x * dimBlock.y);
    size_t size = N * sizeof(float);    // size in bytes

    // Allocate the vector in host memory
    h_V = (float*)malloc(size);
    if (h_V == NULL) {
        fprintf(stderr, "host malloc of %lu bytes failed\n", (unsigned long)size);
        Cleanup(false);
    }

    // Allocate the vector in device memory
    CheckCuda(cudaMalloc((void**)&d_V, size), "cudaMalloc");

    // Invoke kernel; launch-configuration errors are reported by cudaGetLastError()
    TouchVec<<<dimGrid, dimBlock>>>(d_V, N);
    CheckCuda(cudaGetLastError(), "TouchVec launch");
#ifdef _DEBUG
    // Debug builds wait for the kernel so execution errors surface here
    // instead of at the following copy.
    CheckCuda(cudaDeviceSynchronize(), "TouchVec execution");
#endif

    // Copy result from device memory to host memory; h_V then holds the result
    CheckCuda(cudaMemcpy(h_V, d_V, size, cudaMemcpyDeviceToHost), "cudaMemcpy device->host");

    // Verify result: element i must hold the value i
    int i;
    for (i = 0; i < N; ++i) {
        if (fabsf(h_V[i] - (float)i) > 1e-5f)
            break;
    }
    printf("Test %s \n", (i == N) ? "PASSED" : "FAILED");

    // Print corner values of selected thread blocks
    int lastTx = (int)dimBlock.x - 1;   // 15 for the 16x8 block
    int lastTy = (int)dimBlock.y - 1;   // 7 for the 16x8 block
    float V00, V150, V07;

    // Corners block 0 0
    V00  = HostValueAt(dimGrid, dimBlock, 0, 0, 0, 0);
    V150 = HostValueAt(dimGrid, dimBlock, 0, 0, lastTx, 0);
    V07  = HostValueAt(dimGrid, dimBlock, 0, 0, 0, lastTy);
    printf("Corner values in thread block Bx=0 By=0\n");
    printf("(Tx=0,Ty0) = %f, G(Tx=15, Ty=0) = %f, G(Tx=0,Ty=7)(0) = %f\n", V00, V150, V07);

    // Corners block 3 0
    V00  = HostValueAt(dimGrid, dimBlock, 3, 0, 0, 0);
    V150 = HostValueAt(dimGrid, dimBlock, 3, 0, lastTx, 0);
    V07  = HostValueAt(dimGrid, dimBlock, 3, 0, 0, lastTy);
    printf("Corner values in thread block Bx=3 By=0\n");
    printf("G(Tx=0,Ty=0) = %f, G(Tx=15,Ty=0) = %f, G(Tx=0,Ty=7) = %f\n", V00, V150, V07);

    // Corners block 0 1
    V00  = HostValueAt(dimGrid, dimBlock, 0, 1, 0, 0);
    V150 = HostValueAt(dimGrid, dimBlock, 0, 1, lastTx, 0);
    V07  = HostValueAt(dimGrid, dimBlock, 0, 1, 0, lastTy);
    printf("Corner values in thread block Bx=0 By=1\n");
    printf("G(Tx=0,Ty=0) = %f, G(Tx=15,Ty=0) = %f, G(Tx=0,Ty=7) = %f\n", V00, V150, V07);

    Cleanup(true);   // frees resources and exits the process
}

/*
 * Release host and device memory, reset the device, and exit the process.
 * Simplified version of the cleanup routine from the CUDA SDK samples.
 *
 *   noError  true when the caller reached this point without a failure;
 *            false when an allocation or CUDA call failed.
 *
 * Never returns. The exit status is EXIT_SUCCESS only when noError is true
 * and the device reset succeeded; otherwise it is EXIT_FAILURE.
 */
void Cleanup(bool noError)
{
    cudaError_t error;

    // Free device memory
    if (d_V) {
        cudaFree(d_V);
        d_V = NULL;
    }

    // Free host memory
    if (h_V) {
        free(h_V);
        h_V = NULL;
    }

    // cudaDeviceReset() replaces the deprecated cudaThreadExit()
    error = cudaDeviceReset();

    bool failed = !noError || error != cudaSuccess;
    if (failed)
        printf("cuda malloc or cuda thread exit failed \n");

    fflush(stdout);
    fflush(stderr);

    exit(failed ? EXIT_FAILURE : EXIT_SUCCESS);
}