///
/// matmult.cu
/// For CSU CS575 Spring 2011
/// Instructor: Wim Bohm
/// Based on code from the CUDA Programming Guide
/// Modified by Wim Bohm and David Newman
/// Created: 2011-01-27
/// Last Modified: 2011-02-19 DVN
///
/// Do not modify this file. The GTA will grade your
/// code using the master copy of this file, not your
/// copy, so any modifications you make will not play
/// a role in the grading.
///

// Includes
// NOTE(review): the angle-bracket header names below are chosen from the
// symbols this file uses (printf/sscanf, malloc/free/exit, fabs, and the
// sdk*Timer API) -- confirm them against the master copy.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <helper_timer.h>
#include "matmultKernel.h"

// Defines
#define epsilon (float)1e-4
#define verbose 0

// Abort with a readable message when a CUDA runtime call fails, instead of
// silently continuing with invalid device state.
#define MATMULT_CUDA_CHECK(call)                                          \
  do {                                                                    \
    cudaError_t matmult_err_ = (call);                                    \
    if (matmult_err_ != cudaSuccess) {                                    \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
              cudaGetErrorString(matmult_err_));                          \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)

// Create a new matrix in device memory with the same width and height as M.
// The device matrix's stride is set to M.width.
// If copy is true, M.elements is copied from host memory to the new
// device allocation; otherwise the device elements are left uninitialized.
// Returns the descriptor of the device matrix; the caller must cudaFree
// its elements.
Matrix MakeDeviceMatrix(Matrix M, bool copy){
  Matrix newDeviceMatrix;
  newDeviceMatrix.width = M.width;
  newDeviceMatrix.stride = M.width;
  newDeviceMatrix.height = M.height;
  // Widen before multiplying so width*height cannot overflow int.
  size_t size = (size_t)M.width * M.height * sizeof(float);
  MATMULT_CUDA_CHECK(cudaMalloc((void**) &newDeviceMatrix.elements, size));
  if (copy) {
    MATMULT_CUDA_CHECK(cudaMemcpy(newDeviceMatrix.elements, M.elements, size,
                                  cudaMemcpyHostToDevice));
  }
  return newDeviceMatrix;
}

// Host code for matrix multiplication.
// A and B are the host input matrices, C is the host result matrix, and
// dimension is the per-block tile size used to derive the grid:
// the grid is (B.width/dimension) x (A.height/dimension), so the matrix
// dimensions must be multiples of dimension (integer division truncates).
// This code assumes that the matrix is square.
// The kernel is run once as a warm-up and once under a timer; the timing
// results are printed and the result is copied into C.elements.
void MatMul(const Matrix A, const Matrix B, Matrix C, int dimension){

  // Create device data structures.
  Matrix device_A = MakeDeviceMatrix(A, true);
  Matrix device_B = MakeDeviceMatrix(B, true);
  Matrix device_C = MakeDeviceMatrix(C, false);

  // Define grid topology
  dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
  dim3 dimGrid(B.width/dimension, A.height/dimension);

  // Invoke kernel for warm up
  MatMulKernel<<<dimGrid, dimBlock>>>(device_A, device_B, device_C);
  // A bad launch configuration is only reported through cudaGetLastError.
  MATMULT_CUDA_CHECK(cudaGetLastError());

  // Synchronize to make sure everyone is done in the warmup.
  // (cudaThreadSynchronize is deprecated; cudaDeviceSynchronize replaces it.)
  MATMULT_CUDA_CHECK(cudaDeviceSynchronize());

  // Set up timer
  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);

  // Invoke kernel for the timed run
  MatMulKernel<<<dimGrid, dimBlock>>>(device_A, device_B, device_C);

  // Synchronize to make sure everyone is done.
  // The status is checked only after the timer stops, so the timed region
  // contains just the launch and the synchronization.
  cudaError_t syncStatus = cudaDeviceSynchronize();

  // Compute and report the timing results
  sdkStopTimer(&timer);
  MATMULT_CUDA_CHECK(cudaGetLastError());
  MATMULT_CUDA_CHECK(syncStatus);

  float time = sdkGetTimerValue(&timer);
  // One multiply and one add per inner-product term.
  float nFlops = (float)A.width*A.height*B.width*2;
  // time is used as milliseconds here (see the "(ms)" label below).
  float nFlopsPerSec = 1e3*nFlops/time;
  float nGFlopsPerSec = nFlopsPerSec*1e-9;
  printf( "Data dimensions: %dx%d \n", C.height, C.width);
  printf( "Grid Dimensions: %dx%d \n",dimGrid.x,dimGrid.y);
  printf( "Block Dimensions: %dx%d \n",dimBlock.x,dimBlock.y);
  printf( "Footprint Dimensions: %dx%d \n",FOOTPRINT_SIZE,FOOTPRINT_SIZE);
  printf( "Time: %f (ms), nFlops: %0.0f, GFlopsS: %f\n", time, nFlops, nGFlopsPerSec);

  // Copy the result to the host memory from device memory
  size_t size = (size_t)C.width * C.height * sizeof(float);
  MATMULT_CUDA_CHECK(cudaMemcpy(C.elements, device_C.elements, size,
                                cudaMemcpyDeviceToHost));

  // Free device memory
  MATMULT_CUDA_CHECK(cudaFree(device_A.elements));
  MATMULT_CUDA_CHECK(cudaFree(device_B.elements));
  MATMULT_CUDA_CHECK(cudaFree(device_C.elements));
  sdkDeleteTimer(&timer);
}

// Create a width x height matrix in host memory.
// The elements are allocated with malloc and are not initialized;
// the caller must free them.
Matrix MakeHostMatrix(int width, int height){
  Matrix newHostMatrix;
  newHostMatrix.width = width;
  // Keep stride defined on host matrices too, matching MakeDeviceMatrix.
  newHostMatrix.stride = width;
  newHostMatrix.height = height;
  // Widen before multiplying so width*height cannot overflow int.
  size_t size = (size_t)newHostMatrix.width * newHostMatrix.height * sizeof(float);
  newHostMatrix.elements = (float*)malloc(size);
  if (newHostMatrix.elements == NULL && size != 0) {
    fprintf(stderr, "MakeHostMatrix: failed to allocate %lu bytes\n",
            (unsigned long)size);
    exit(EXIT_FAILURE);
  }
  return newHostMatrix;
}

// Print a matrix stored in host memory, preceded by its name.
// NOTE(review): the loop body was rebuilt from a damaged copy of this
// function -- confirm the element format against the master copy.
void printMatrix(Matrix M, const char* name) {
  printf("\n%s \n",name);
  for(int y=0; y<M.height; y++){
    for(int x=0; x<M.width; x++) {
      printf("%f ", M.elements[y * M.width + x]);
    }
    printf("\n");
  }
}

// Initialize dummy data in a matrix stored in host memory.
// NOTE(review): this function was rebuilt from its call sites in main
// (initMatrix(host_A,false), initMatrix(host_B,true)) -- confirm against the
// master copy. As written: horizontal selects the column index as the
// element value, otherwise the row index is used.
void initMatrix(Matrix M, bool horizontal) {
  for(int y=0; y<M.height; y++) {
    for(int x=0; x<M.width; x++) {
      M.elements[y*M.width+x] = (float)(horizontal?x:y);
    }
  }
}

// Check the specified matrix to be sure it is correct.
// That is, make sure it is the result of multiplying the
// dummy data created by initMatrix. Prints a failure report when any
// element is outside the relative tolerance epsilon; prints nothing on
// success.
void checkResult(Matrix M) {

  Matrix correct = MakeHostMatrix(M.width, M.height);

  // NOTE(review): the expected values were rebuilt to match initMatrix above
  // (A[y][k] = y, B[k][x] = x, so C[y][x] = width * x * y) -- confirm
  // against the master copy.
  for(int y=0; y<M.height; y++) {
    for(int x=0; x<M.width; x++) {
      correct.elements[y*correct.width+x] = (float)M.width*(float)x*y;
    }
  }

  if(verbose){
    // print correct
    printMatrix(correct, "correct");

    // print host_C
    printMatrix(M, "result");
  }

  double maxerror = 0.0;
  int errCnt = 0;
  for(int y=0; y<correct.height; y++) {
    for(int x=0; x<correct.width; x++) {
      float it = correct.elements[y*correct.width+x];
      // The tolerance scales with the expected value; when it is 0 the
      // tolerance is 0 too, so any nonzero result counts as an error.
      if(fabs(it - M.elements[y*M.width+x])> epsilon*it) {
        errCnt++;
        double error = fabs(it - M.elements[y*M.width+x])/it;
        if (error > maxerror) maxerror = error;
      }
    }
  }

  if(errCnt>0){
    printf("\n\nTEST FAILED: number of errors: %d, max rel error: %f\n", errCnt, maxerror);
  }

  free(correct.elements);
}

//
// main
//
// Usage: <program> NumBlocks
// Multiplies two (NumBlocks * FOOTPRINT_SIZE)-square matrices on the GPU,
// reports the timing and verifies the result.
//
int main(int argc, char** argv) {

  // Grid dimension
  int num_blocks;
  // Matrix dimensions in multiples of FOOTPRINT_SIZE
  // Matrices will be of size data_size * data_size
  int data_size;

  // Read command line argument
  if(argc != 2){
    printf("Usage: %s NumBlocks\n", argv[0]);
    exit(0);
  }
  // Reject input that is not a positive integer; otherwise num_blocks
  // would be used uninitialized or yield an empty/negative matrix size.
  if(sscanf(argv[1], "%d", &num_blocks) != 1 || num_blocks <= 0){
    fprintf(stderr, "NumBlocks must be a positive integer, got \"%s\"\n", argv[1]);
    exit(EXIT_FAILURE);
  }
  data_size = num_blocks * FOOTPRINT_SIZE;

  // Create matrices in host.
  Matrix host_A = MakeHostMatrix(data_size, data_size);
  Matrix host_B = MakeHostMatrix(data_size, data_size);
  Matrix host_C = MakeHostMatrix(data_size, data_size);

  // Initialize values in host A and B
  initMatrix(host_A,false);
  initMatrix(host_B,true);

  // debugging
  if(verbose){
    printMatrix(host_A, "host_A");
    printMatrix(host_B, "host_B");
  }

  // Perform CUDA matrix Multiplication
  // MatMul is a host function that calls
  // the device kernel MatMulKernel and
  // times its performance.
  MatMul(host_A,host_B,host_C,FOOTPRINT_SIZE);

  // Verify that the result is correct.
  checkResult(host_C);

  // Free allocated memory.
  free(host_A.elements);
  free(host_B.elements);
  free(host_C.elements);

  return 0;
}