// Tiled matrix multiplication, adapted from the CUDA Programming Guide
// shared-memory example. main() added, comments extended.
//
// NOTE(review): this file had been mangled (angle-bracketed spans stripped:
// include headers, <<<...>>> launch syntax, and most of main()); the missing
// pieces were reconstructed from the surviving fragments, and the legacy
// cutil timers were replaced with cudaEvent timing.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16
#define epsilon (float)1e-4
#define verbose 0

// Abort with a message if a CUDA runtime call fails. Kernel launches do not
// return an error directly; cudaGetLastError() after the launch catches
// launch-configuration errors.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Matrices are stored in row-major order:
//   M[row,col] = *(M.elements + row * M.stride + col)
// A sub-matrix descriptor reuses the stride of the full matrix, so taking a
// sub-matrix never copies element data. Matrix is really a MATRIX DESCRIPTOR.
typedef struct {
    int width;
    int height;
    int stride;      // row pitch in elements (== width for a full matrix)
    float* elements; // not owned by the descriptor
} Matrix;

// Read one element of A (A.stride accounts for sub-matrix views).
__device__ float GetElement(const Matrix A, int row, int col) {
    return A.elements[row * A.stride + col];
}

// Write one element of A (A.stride accounts for sub-matrix views).
__device__ void SetElement(Matrix A, int row, int col, float value) {
    A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZE x BLOCK_SIZE sub-MATRIX-DESCRIPTOR ASub of A located
// `col` sub-matrices to the right and `row` sub-matrices down from the upper
// left corner of A. row and col are therefore BLOCK COORDINATES, not indices
// of block origins. No element data is copied.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col) {
    Matrix ASub;
    ASub.width = BLOCK_SIZE;
    ASub.height = BLOCK_SIZE;
    ASub.stride = A.stride; // inherit the parent pitch
    ASub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return ASub;
}

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Host code: computes C = A * B on the device and times the kernel.
// Precondition: all matrix dimensions are multiples of BLOCK_SIZE
// (the kernel has no bounds checks).
void MatMul(const Matrix A, const Matrix B, Matrix C) {
    // Copy A and B to device memory; full matrices, so stride == width.
    Matrix d_A;
    d_A.width = d_A.stride = A.width;
    d_A.height = A.height;
    size_t size = (size_t)A.width * A.height * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)&d_A.elements, size));
    CUDA_CHECK(cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice));

    Matrix d_B;
    d_B.width = d_B.stride = B.width;
    d_B.height = B.height;
    size = (size_t)B.width * B.height * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)&d_B.elements, size));
    CUDA_CHECK(cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice));

    Matrix d_C;
    d_C.width = d_C.stride = C.width;
    d_C.height = C.height;
    size = (size_t)C.width * C.height * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)&d_C.elements, size));

    // One thread per element of C; one thread block per BLOCK_SIZE^2 tile.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

    // Invoke kernel once for warm up (excludes one-time startup cost
    // from the timed run).
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Time the real run with CUDA events (replaces the legacy cutil timers).
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time = 0.0f; // elapsed milliseconds
    CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // 2 flops (multiply + add) per inner-product term.
    float nFlops = (float)A.width * A.height * B.width * 2;
    float nFlopsPerSec = 1e3f * nFlops / time; // time is in ms
    float nGFlopsPerSec = nFlopsPerSec * 1e-9f;
    printf("MM dimensions, A: %d,%d B: %d,%d C: %d,%d\nTime: %f (ms), nFlops: %f, GFlopsS: %f\n",
           A.height, A.width, B.height, B.width, C.height, C.width,
           time, nFlops, nGFlopsPerSec);

    // Read C back from device memory.
    CUDA_CHECK(cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost));

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A.elements));
    CUDA_CHECK(cudaFree(d_B.elements));
    CUDA_CHECK(cudaFree(d_C.elements));
}

// Each THREAD BLOCK computes one BLOCK_SIZE x BLOCK_SIZE sub-matrix Csub of
// C; each THREAD computes one element of Csub. Expects a 2D launch with
// blockDim == (BLOCK_SIZE, BLOCK_SIZE) and all dimensions multiples of
// BLOCK_SIZE.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) {
    // Block row and column coordinates.
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;

    // EACH THREAD creates its own descriptor Csub (descriptors are cheap:
    // no element data is copied).
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

    // Each thread accumulates one element of Csub.
    float Cvalue = 0.0f;

    // Thread row and col WITHIN Csub.
    int row = threadIdx.y;
    int col = threadIdx.x;

    // Loop over all sub-matrix pairs in blockRow of A and blockCol of B
    // required to compute Csub; block-multiply each pair and accumulate.
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
        // Get Asub and Bsub descriptors.
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        Matrix Bsub = GetSubMatrix(B, m, blockCol);

        // Stage Asub and Bsub in shared memory. EACH THREAD loads ONE
        // ELEMENT of Asub and one of Bsub. Notice: it need not be the
        // element it requires to compute its Cvalue, as long as all
        // elements are collaboratively read — so we read in row-major
        // order and exploit coalescing for both tiles.
        // NOTICE: every thread declares As and Bs, but a thread block has
        // only one shared As and one shared Bs.
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);

        // Synchronize to ensure both tiles are fully loaded before use.
        __syncthreads();

        // Inner product of one row of Asub and one column of Bsub.
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];

        // Synchronize before the next iteration overwrites As/Bs.
        __syncthreads();
    }

    // Each thread writes its single element of Csub.
    SetElement(Csub, row, col, Cvalue);
}

// Fill a host array with random values in [0,1).
static void randomInit(float* data, int n) {
    for (int i = 0; i < n; ++i)
        data[i] = (float)rand() / (float)RAND_MAX;
}

// Allocate host matrices, run MatMul on the device, then verify against a
// straightforward CPU reference with a relative-error tolerance of epsilon.
// NOTE(review): most of main() was lost to the mangling; dimensions and the
// PASSED branch are reconstructed — the FAILED message and error counting
// match the surviving fragments.
int main(void) {
    // Dimensions must be multiples of BLOCK_SIZE (kernel precondition).
    const int hA = 10 * BLOCK_SIZE;
    const int wA = 10 * BLOCK_SIZE;
    const int wB = 10 * BLOCK_SIZE;

    Matrix h_A, h_B, h_C;
    h_A.width = h_A.stride = wA;
    h_A.height = hA;
    h_A.elements = (float*)malloc((size_t)wA * hA * sizeof(float));
    h_B.width = h_B.stride = wB;
    h_B.height = wA; // inner dimensions must agree: B.height == A.width
    h_B.elements = (float*)malloc((size_t)wB * wA * sizeof(float));
    h_C.width = h_C.stride = wB;
    h_C.height = hA;
    h_C.elements = (float*)malloc((size_t)wB * hA * sizeof(float));
    if (!h_A.elements || !h_B.elements || !h_C.elements) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    srand(2006); // fixed seed for reproducible runs
    randomInit(h_A.elements, wA * hA);
    randomInit(h_B.elements, wB * wA);

    MatMul(h_A, h_B, h_C);

    // CPU reference check: recompute each element and compare relatively.
    int errCnt = 0;
    double maxerror = 0.0;
    for (int y = 0; y < h_C.height; ++y) {
        for (int x = 0; x < h_C.width; ++x) {
            float it = 0.0f;
            for (int k = 0; k < h_A.width; ++k)
                it += h_A.elements[y * h_A.width + k] * h_B.elements[k * h_B.width + x];
            if (fabs(it - h_C.elements[y * h_C.width + x]) > epsilon * it) {
                errCnt++;
                double error = fabs(it - h_C.elements[y * h_C.width + x]) / it;
                if (error > maxerror)
                    maxerror = error;
            }
        }
    }

    if (errCnt > 0) {
        printf("\n\nTEST FAILED: number of errors: %d, max rel error: %f\n",
               errCnt, maxerror);
    } else {
        printf("\nTEST PASSED\n");
    }

    free(h_A.elements);
    free(h_B.elements);
    free(h_C.elements);
    return errCnt > 0 ? EXIT_FAILURE : EXIT_SUCCESS;
}