// Tiled matrix multiplication, adapted from the CUDA Programming Guide
// shared-memory example. main() added, comments extended.
//
// NOTE(review): this file had been mangled (angle-bracketed spans stripped:
// include headers, <<<...>>> launch syntax, and most of main()); the missing
// pieces were reconstructed from the surviving fragments, and the legacy
// cutil timers were replaced with cudaEvent timing.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16
#define epsilon (float)1e-4
#define verbose 0

// Abort with a message if a CUDA runtime call fails. Kernel launches do not
// return an error directly; cudaGetLastError() after the launch catches
// launch-configuration errors.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Matrices are stored in row-major order:
//   M[row,col] = *(M.elements + row * M.stride + col)
// A sub-matrix descriptor reuses the stride of the full matrix, so taking a
// sub-matrix never copies element data. Matrix is really a MATRIX DESCRIPTOR.
typedef struct {
    int width;
    int height;
    int stride;      // row pitch in elements (== width for a full matrix)
    float* elements; // not owned by the descriptor
} Matrix;

// Read one element of A (A.stride accounts for sub-matrix views).
__device__ float GetElement(const Matrix A, int row, int col) {
    return A.elements[row * A.stride + col];
}

// Write one element of A (A.stride accounts for sub-matrix views).
__device__ void SetElement(Matrix A, int row, int col, float value) {
    A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZE x BLOCK_SIZE sub-MATRIX-DESCRIPTOR ASub of A located
// `col` sub-matrices to the right and `row` sub-matrices down from the upper
// left corner of A. row and col are therefore BLOCK COORDINATES, not indices
// of block origins. No element data is copied.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col) {
    Matrix ASub;
    ASub.width = BLOCK_SIZE;
    ASub.height = BLOCK_SIZE;
    ASub.stride = A.stride; // inherit the parent pitch
    ASub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return ASub;
}

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Host code: computes C = A * B on the device and times the kernel.
// Precondition: all matrix dimensions are multiples of BLOCK_SIZE
// (the kernel has no bounds checks).
void MatMul(const Matrix A, const Matrix B, Matrix C) {
    // Copy A and B to device memory; full matrices, so stride == width.
    Matrix d_A;
    d_A.width = d_A.stride = A.width;
    d_A.height = A.height;
    size_t size = (size_t)A.width * A.height * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)&d_A.elements, size));
    CUDA_CHECK(cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice));

    Matrix d_B;
    d_B.width = d_B.stride = B.width;
    d_B.height = B.height;
    size = (size_t)B.width * B.height * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)&d_B.elements, size));
    CUDA_CHECK(cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice));

    Matrix d_C;
    d_C.width = d_C.stride = C.width;
    d_C.height = C.height;
    size = (size_t)C.width * C.height * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)&d_C.elements, size));

    // One thread per element of C; one thread block per BLOCK_SIZE^2 tile.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

    // Invoke kernel once for warm up (excludes one-time startup cost
    // from the timed run).
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Time the real run with CUDA events (replaces the legacy cutil timers).
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time = 0.0f; // elapsed milliseconds
    CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // 2 flops (multiply + add) per inner-product term.
    float nFlops = (float)A.width * A.height * B.width * 2;
    float nFlopsPerSec = 1e3f * nFlops / time; // time is in ms
    float nGFlopsPerSec = nFlopsPerSec * 1e-9f;
    printf("MM dimensions, A: %d,%d B: %d,%d C: %d,%d\nTime: %f (ms), nFlops: %f, GFlopsS: %f\n",
           A.height, A.width, B.height, B.width, C.height, C.width,
           time, nFlops, nGFlopsPerSec);

    // Read C back from device memory.
    CUDA_CHECK(cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost));

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A.elements));
    CUDA_CHECK(cudaFree(d_B.elements));
    CUDA_CHECK(cudaFree(d_C.elements));
}

// Each THREAD BLOCK computes one BLOCK_SIZE x BLOCK_SIZE sub-matrix Csub of
// C; each THREAD computes one element of Csub. Expects a 2D launch with
// blockDim == (BLOCK_SIZE, BLOCK_SIZE) and all dimensions multiples of
// BLOCK_SIZE.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) {
    // Block row and column coordinates.
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;

    // EACH THREAD creates its own descriptor Csub (descriptors are cheap:
    // no element data is copied).
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

    // Each thread accumulates one element of Csub.
    float Cvalue = 0.0f;

    // Thread row and col WITHIN Csub.
    int row = threadIdx.y;
    int col = threadIdx.x;

    // Loop over all sub-matrix pairs in blockRow of A and blockCol of B
    // required to compute Csub; block-multiply each pair and accumulate.
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
        // Get Asub and Bsub descriptors.
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        Matrix Bsub = GetSubMatrix(B, m, blockCol);

        // Stage Asub and Bsub in shared memory. EACH THREAD loads ONE
        // ELEMENT of Asub and one of Bsub. Notice: it need not be the
        // element it requires to compute its Cvalue, as long as all
        // elements are collaboratively read — so we read in row-major
        // order and exploit coalescing for both tiles.
        // NOTICE: every thread declares As and Bs, but a thread block has
        // only one shared As and one shared Bs.
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);

        // Synchronize to ensure both tiles are fully loaded before use.
        __syncthreads();

        // Inner product of one row of Asub and one column of Bsub.
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];

        // Synchronize before the next iteration overwrites As/Bs.
        __syncthreads();
    }

    // Each thread writes its single element of Csub.
    SetElement(Csub, row, col, Cvalue);
}

// Fill a host array with random values in [0,1).
static void randomInit(float* data, int n) {
    for (int i = 0; i < n; ++i)
        data[i] = (float)rand() / (float)RAND_MAX;
}

// Allocate host matrices, run MatMul on the device, then verify against a
// straightforward CPU reference with a relative-error tolerance of epsilon.
// NOTE(review): most of main() was lost to the mangling; dimensions and the
// PASSED branch are reconstructed — the FAILED message and error counting
// match the surviving fragments.
int main(void) {
    // Dimensions must be multiples of BLOCK_SIZE (kernel precondition).
    const int hA = 10 * BLOCK_SIZE;
    const int wA = 10 * BLOCK_SIZE;
    const int wB = 10 * BLOCK_SIZE;

    Matrix h_A, h_B, h_C;
    h_A.width = h_A.stride = wA;
    h_A.height = hA;
    h_A.elements = (float*)malloc((size_t)wA * hA * sizeof(float));
    h_B.width = h_B.stride = wB;
    h_B.height = wA; // inner dimensions must agree: B.height == A.width
    h_B.elements = (float*)malloc((size_t)wB * wA * sizeof(float));
    h_C.width = h_C.stride = wB;
    h_C.height = hA;
    h_C.elements = (float*)malloc((size_t)wB * hA * sizeof(float));
    if (!h_A.elements || !h_B.elements || !h_C.elements) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    srand(2006); // fixed seed for reproducible runs
    randomInit(h_A.elements, wA * hA);
    randomInit(h_B.elements, wB * wA);

    MatMul(h_A, h_B, h_C);

    // CPU reference check: recompute each element and compare relatively.
    int errCnt = 0;
    double maxerror = 0.0;
    for (int y = 0; y < h_C.height; ++y) {
        for (int x = 0; x < h_C.width; ++x) {
            float it = 0.0f;
            for (int k = 0; k < h_A.width; ++k)
                it += h_A.elements[y * h_A.width + k] * h_B.elements[k * h_B.width + x];
            if (fabs(it - h_C.elements[y * h_C.width + x]) > epsilon * it) {
                errCnt++;
                double error = fabs(it - h_C.elements[y * h_C.width + x]) / it;
                if (error > maxerror)
                    maxerror = error;
            }
        }
    }

    if (errCnt > 0) {
        printf("\n\nTEST FAILED: number of errors: %d, max rel error: %f\n",
               errCnt, maxerror);
    } else {
        printf("\nTEST PASSED\n");
    }

    free(h_A.elements);
    free(h_B.elements);
    free(h_C.elements);
    return errCnt > 0 ? EXIT_FAILURE : EXIT_SUCCESS;
}