// Copied from the Cuda Programming Guide // main added // Includes #include #include #include #define BLOCK_SIZE 2 #define epsilon (float)1e-5 #define verbose 1 // Matrices stored in row major order: M[row,col] = *(M.elements + row * M.width + col) typedef struct { int width; int height; float* elements; } Matrix; __global__ void MatMulKernel(const Matrix, const Matrix, Matrix); // Host code, matrix dimensions must be multiples of BLOCK_SIZE void MatMul(const Matrix A, const Matrix B, Matrix C){ Matrix d_A; d_A.width = A.width; d_A.height = A.height; size_t size = A.width * A.height * sizeof(float); cudaMalloc((void**) &d_A.elements, size); cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice); Matrix d_B; d_B.width = B.width; d_B.height = B.height; size = B.width * B.height * sizeof(float); cudaMalloc((void**) &d_B.elements, size); cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice); Matrix d_C; d_C.width = C.width; d_C.height = C.height; size = C.width * C.height * sizeof(float); cudaMalloc((void**) &d_C.elements, size); // Invoke kernel dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE); dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y); MatMulKernel<<>>(d_A, d_B, d_C); cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost); // Free device memory cudaFree(d_A.elements); cudaFree(d_B.elements); cudaFree(d_C.elements); } __global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){ // each thread computes one element of C float Cvalue = 0; int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; for (int e = 0; e < A.width; ++e) Cvalue += A.elements[row*A.width + e] * B.elements[e*B.width + col]; C.elements[row*C.width + col] = Cvalue; } // main int main(int argc, char** argv) { Matrix h_A; h_A.width = 3*BLOCK_SIZE; h_A.height = 2*BLOCK_SIZE; size_t size = h_A.width*h_A.height*sizeof(float); h_A.elements = (float*)malloc(size); Matrix h_B; h_B.width = 4*BLOCK_SIZE; h_B.height = 3*BLOCK_SIZE; size = h_B.width*h_B.height*sizeof(float); h_B.elements = (float*)malloc(size); Matrix h_C; h_C.width = 4*BLOCK_SIZE; h_C.height = 2*BLOCK_SIZE; size = h_C.width*h_C.height*sizeof(float); h_C.elements = (float*)malloc(size); Matrix gold_C; gold_C.width = 4*BLOCK_SIZE; gold_C.height = 2*BLOCK_SIZE; gold_C.elements = (float*)malloc(size); for(int y=0; y epsilon*it) errCnt++; } } if(errCnt==0) printf("\nTEST PASSED\n"); else printf("\n\nTEST FAILED: number of errors: %d\n", errCnt); }