/*
 * Multiply two vectors A and B element-wise into C on the GPU.
 *  - A and B are initialized on the host
 *  - 1-D grid of 1-D thread blocks
 *  - each thread processes nVperThr elements, stepping by the block width
 *
 * Usage: <program> GrWdth BlWdth nVperThr
 */

// Includes
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

// Variables
float* h_A;  // host input A
float* h_B;  // host input B
float* h_C;  // host copy of the device result
float* h_G;  // host-computed reference ("gold") result
float* d_A;
float* d_B;
float* d_C;

// Functions
void Cleanup(bool);
void ParseArguments(int, char**);

// Prints a diagnostic and terminates through Cleanup(false) when a CUDA
// runtime call did not return cudaSuccess.
static void CheckCuda(cudaError_t error, const char* what)
{
    if (error != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(error));
        Cleanup(false);
    }
}

// Allocates `size` bytes on the host, terminating through Cleanup(false)
// when the allocation fails.
static float* AllocHost(size_t size)
{
    float* p = (float*)malloc(size);
    if (p == 0) {
        fprintf(stderr, "host malloc of %lu bytes failed\n", (unsigned long)size);
        Cleanup(false);
    }
    return p;
}

// Device code
//
// C[i] = A[i] * B[i] for the slice of the vectors owned by this block.
//
// Expected launch: 1-D grid, 1-D block. Block b owns the index range
// [b * blockDim.x * nVPT, (b + 1) * blockDim.x * nVPT); inside that range
// thread t handles t, t + blockDim.x, t + 2 * blockDim.x, ...
//
// Precondition: A, B and C each hold at least gridDim.x * blockDim.x * nVPT
// elements and nVPT > 0. There is no separate element-count guard.
__global__ void MulVec(const float* A, const float* B, float* C, int nVPT)
{
    // Widen before multiplying so the offsets cannot wrap in 32-bit arithmetic.
    size_t blockSpan       = (size_t)blockDim.x * (size_t)nVPT;
    size_t blockStartIndex = (size_t)blockIdx.x * blockSpan;
    size_t endIndex        = blockStartIndex + blockSpan;  // first index out

    for (size_t i = blockStartIndex + threadIdx.x; i < endIndex; i += blockDim.x) {
        C[i] = A[i] * B[i];
    }
}

// Host code
int main(int argc, char** argv)
{
    int GrWdth;  // Grid width
    int BlWdth;  // Block width
    int NVpThr;  // number of values per thread

    if (argc < 4) {
        fprintf(stderr, "Need 3 params: GrWdth BlWdth nVperThr\n");
        exit(EXIT_FAILURE);
    }
    if (sscanf(argv[1], "%d", &GrWdth) != 1 ||
        sscanf(argv[2], "%d", &BlWdth) != 1 ||
        sscanf(argv[3], "%d", &NVpThr) != 1 ||
        GrWdth <= 0 || BlWdth <= 0 || NVpThr <= 0) {
        fprintf(stderr, "GrWdth, BlWdth and nVperThr must be positive integers\n");
        exit(EXIT_FAILURE);
    }

    // N fits the grid / block space exactly: one slot per (block, thread, value).
    // Built up step by step so an overflowing request is rejected, not wrapped.
    const size_t maxElems = (size_t)-1 / sizeof(float);
    size_t N = (size_t)GrWdth;
    if (N > maxElems / (size_t)BlWdth || N * (size_t)BlWdth > maxElems / (size_t)NVpThr) {
        fprintf(stderr, "Requested problem size is too large\n");
        exit(EXIT_FAILURE);
    }
    N = N * (size_t)BlWdth * (size_t)NVpThr;
    size_t size = N * sizeof(float);  // size in bytes

    dim3 dimGrid(GrWdth);
    dim3 dimBlock(BlWdth);

    // Allocate input, output and reference vectors in host memory
    h_A = AllocHost(size);
    h_B = AllocHost(size);
    h_C = AllocHost(size);
    h_G = AllocHost(size);

    // Allocate vectors in device memory
    CheckCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    CheckCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    CheckCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    // Initialize h_X
    // NOTE(review): the original initialization values were lost when this
    // file was extracted; any deterministic inputs work as long as h_G holds
    // the host-computed product. Confirm against the original if it matters.
    size_t i;
    for (i = 0; i < N; ++i) {
        h_A[i] = (float)i;
        h_B[i] = (float)(N - i);
        h_G[i] = h_A[i] * h_B[i];
    }

    // Copy vectors from host memory to device memory
    CheckCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
    CheckCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");

    // Warm up: the first launch pays one-time start-up costs that should
    // not be included in the measurement.
    MulVec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, NVpThr);
    CheckCuda(cudaGetLastError(), "MulVec warm-up launch");
    CheckCuda(cudaDeviceSynchronize(), "MulVec warm-up execution");

    // Set up timer (CUDA events measure elapsed time on the device timeline)
    cudaEvent_t start;
    cudaEvent_t stop;
    CheckCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    CheckCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Invoke kernel
    CheckCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    MulVec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, NVpThr);
    CheckCuda(cudaGetLastError(), "MulVec launch");
    CheckCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also surfaces faults raised inside the kernel.
    CheckCuda(cudaEventSynchronize(stop), "MulVec execution");

    float elapsedMs = 0.0f;
    CheckCuda(cudaEventElapsedTime(&elapsedMs, start, stop), "cudaEventElapsedTime");
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Rates are computed in double: the byte count overflows a 32-bit int
    // for large N.
    double nFlops        = (double)N;                  // one multiply per element
    double nBytes        = 3.0 * sizeof(float) * (double)N;  // 2N words in, 1N word out
    double nGFlopsPerSec = 1e3 * nFlops / elapsedMs * 1e-9;
    double nGBytesPerSec = 1e3 * nBytes / elapsedMs * 1e-9;
    printf("Time: %f (ms), GFlopsS: %f, GBytesS: %f\n",
           elapsedMs, nGFlopsPerSec, nGBytesPerSec);

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    CheckCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_C)");

    // Verify result: relative error of each element against the reference.
    // The absolute value is taken of the difference (and of the reference in
    // the tolerance) so results that are too small or of the wrong sign fail.
    for (i = 0; i < N; ++i) {
        if (fabsf(h_C[i] - h_G[i]) > 1e-5f * fabsf(h_G[i]))
            break;
    }
    printf("Test %s \n", (i == N) ? "PASSED" : "FAILED");

    Cleanup(true);
}

// Releases every host and device buffer, resets the device and terminates
// the process. Simplified version from the CUDA SDK.
//
// noError: true when the caller reached the end normally; false when it is
//          bailing out after a failure.
// The process exit status is EXIT_SUCCESS only when noError is true and the
// device reset succeeded. This function does not return.
void Cleanup(bool noError)
{
    cudaError_t error;

    // Free device memories
    if (d_A) cudaFree(d_A);
    if (d_B) cudaFree(d_B);
    if (d_C) cudaFree(d_C);

    // Free host memory
    if (h_A) free(h_A);
    if (h_B) free(h_B);
    if (h_C) free(h_C);
    if (h_G) free(h_G);

    error = cudaDeviceReset();

    bool failed = !noError || error != cudaSuccess;
    if (failed)
        printf("cuda malloc or cuda thread exit failed \n");

    fflush(stdout);
    fflush(stderr);

    exit(failed ? EXIT_FAILURE : EXIT_SUCCESS);
}