/* Add two Vectors A and B in C on GPU * Initialize A and B on host * 2 D Grid * 2 D thread block * each thread adds 1K consecutive elements * */ // Includes #include #include #include // Variables float* h_A; float* h_B; float* h_C; float* d_A; float* d_B; float* d_C; // Functions void Cleanup(bool); void ParseArguments(int, char**); void checkCUDAError(const char *msg); // Device code __global__ void AddVec(const float* A, const float* B, float* C, int N) { // do it } // Host code int main(int argc, char** argv) { int GrWdth; // Grid width int GrHght; // Grid height int BlWdth; // Block width int BlHght; // Block height int NVpThr; // number of values per thread if(argc != 6){ printf("Need 5 params: GrWdth GrHght BlWdth BlHght nVperThr"); exit(0); } else { sscanf(argv[1], "%d", &GrWdth); sscanf(argv[2], "%d", &GrHght); sscanf(argv[3], "%d", &BlWdth); sscanf(argv[4], "%d", &BlHght); sscanf(argv[5], "%d", &NVpThr); } printf("Allocate Vectors A,B,C, initialize A and B on host, device adds the vectors consecutively\n"); int N = GrWdth*GrHght*BlWdth*BlHght*NVpThr; // fit the grid / block space, # vals per thread size_t size = N * sizeof(float); // size in bytes dim3 dimGrid(GrWdth,GrHght); dim3 dimBlock(BlWdth,BlHght,1); cudaError_t error; // Allocate input vectors h_A and h_B in host memory h_A = (float*)malloc(size); if (h_A == 0) Cleanup(false); h_B = (float*)malloc(size); if (h_B == 0) Cleanup(false); h_C = (float*)malloc(size); if (h_C == 0) Cleanup(false); // Allocate vectors in device memory error = cudaMalloc((void**)&d_A, size); if (error != cudaSuccess) Cleanup(false); error = cudaMalloc((void**)&d_B, size); if (error != cudaSuccess) Cleanup(false); error = cudaMalloc((void**)&d_C, size); if (error != cudaSuccess) Cleanup(false); // Initialize h_A and h_B int i; for(i=0; i>>(d_A, d_B, d_C, N); error = cudaGetLastError(); if (error != cudaSuccess) Cleanup(false); // set up timer unsigned int timer = 0; cutCreateTimer( &timer); cutStartTimer( timer); // Invoke kernel AddVec<<>>(d_A, d_B, d_C, N); error = cudaGetLastError(); if (error != cudaSuccess) Cleanup(false); #ifdef _DEBUG error = cudaThreadSynchronize(); if (error != cudaSuccess) Cleanup(false); #endif cudaThreadSynchronize() ; cutStopTimer( timer); float time = cutGetTimerValue( timer); int nFlops = N; float nFlopsPerSec = 1e3*nFlops/time; float nGFlopsPerSec = nFlopsPerSec*1e-9; int nBytes = 3*4*N; // 2N words in, 1N word out float nBytesPerSec = 1e3*nBytes/time; float nGBytesPerSec = nBytesPerSec*1e-9; printf( "Time: %f (ms), GFlopsS: %f, GBytesS: %f\n", time, nGFlopsPerSec, nGBytesPerSec); // Copy result from device memory to host memory // h_C contains the result in host memory error = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); if (error != cudaSuccess) Cleanup(false); // Verify result for (i = 0; i < N; ++i) { float val = h_C[i]; if (fabs(val - N) > 1e-5) break; } printf("Test %s \n", (i == N) ? "PASSED" : "FAILED"); cutDeleteTimer( timer); Cleanup(true); } void Cleanup(bool noError) { // simplified version from CUDA SDK cudaError_t error; // Free device memories if (d_A) cudaFree(d_A); if (d_B) cudaFree(d_B); if (d_C) cudaFree(d_C); // Free host memory if (h_A) free(h_A); if (h_B) free(h_B); if (h_C) free(h_C); error = cudaThreadExit(); if (!noError || error != cudaSuccess) printf("cuda malloc or cuda thread exit failed \n"); fflush( stdout); fflush( stderr); exit(0); } void checkCUDAError(const char *msg) { cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); exit(-1); } }