// Copied from the Cuda Programming Guide
// main added

// Includes
#include <stdio.h>
#include <assert.h>
#include <cutil.h>
#define BLOCK_SIZE 2
#define epsilon (float)1e-5
#define verbose 1

// Matrices stored in row major order: M[row,col] = *(M.elements + row * M.width + col)
typedef struct {
  int width;
  int height;
  float* elements;
} Matrix;


__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Host code, matrix dimensions must be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C){
  Matrix d_A; 
  d_A.width = A.width; d_A.height = A.height;
  size_t size = A.width * A.height * sizeof(float);
  cudaMalloc((void**) &d_A.elements, size);
  cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

  Matrix d_B;
  d_B.width = B.width; d_B.height = B.height;
  size = B.width * B.height * sizeof(float);
  cudaMalloc((void**) &d_B.elements, size);
  cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

  Matrix d_C;
  d_C.width = C.width; d_C.height = C.height;
  size = C.width * C.height * sizeof(float);
  cudaMalloc((void**) &d_C.elements, size);

  // Invoke kernel
  dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
  dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
  MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

  cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

  // Free device memory
  cudaFree(d_A.elements);
  cudaFree(d_B.elements);
  cudaFree(d_C.elements);
}

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){
  // each thread computes one element of C
  float Cvalue = 0;
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  for (int e = 0; e < A.width; ++e)
     Cvalue += A.elements[row*A.width + e] * B.elements[e*B.width + col];

  C.elements[row*C.width + col] = Cvalue;

}

// main

int main(int argc, char** argv) {
  
  Matrix h_A;
  h_A.width = 3*BLOCK_SIZE; h_A.height = 2*BLOCK_SIZE;
  size_t size = h_A.width*h_A.height*sizeof(float);
  h_A.elements = (float*)malloc(size);


  Matrix h_B;
  h_B.width = 4*BLOCK_SIZE; h_B.height = 3*BLOCK_SIZE;
  size = h_B.width*h_B.height*sizeof(float);
  h_B.elements = (float*)malloc(size);


  Matrix h_C;
  h_C.width = 4*BLOCK_SIZE; h_C.height = 2*BLOCK_SIZE;
  size = h_C.width*h_C.height*sizeof(float);
  h_C.elements = (float*)malloc(size);


  Matrix gold_C;
  gold_C.width = 4*BLOCK_SIZE; gold_C.height = 2*BLOCK_SIZE;
  gold_C.elements = (float*)malloc(size);

  for(int y=0; y<h_A.height; y++)
   for(int x=0; x<h_A.width; x++) {
      h_A.elements[y*h_A.width+x] = y;
   }
 
  if(verbose){
  // print h_A
  printf("\nh_A\n");
  for(int y=0; y<h_A.height; y++){
   printf("\n");
   for(int x=0; x<h_A.width; x++) {
      printf("%f ", h_A.elements[y*h_A.width+x]);
   }
  }
  }

  for(int y=0; y<h_B.height; y++)
   for(int x=0; x<h_B.width; x++) {
      h_B.elements[y*h_B.width+x] = x;
   }
 
  if(verbose){
   // print h_B
   printf("\nh_B\n");
   for(int y=0; y<h_B.height; y++){
    printf("\n");
    for(int x=0; x<h_B.width; x++) {
       printf("%f ", h_B.elements[y*h_B.width+x]);
    }
   }
  }


  for(int y=0; y<h_C.height; y++)
   for(int x=0; x<h_C.width; x++) {
      h_C.elements[y*h_C.width+x] = 0.0;
   }


  MatMul(h_A,h_B,h_C);

  for(int y=0; y<h_A.height; y++)
   for(int x=0; x<h_B.width; x++) {
    float Cvalue = (float)0.0;
    for(int k=0; k<h_B.height; k++) 
      Cvalue +=  h_A.elements[y*h_A.width+k] * h_B.elements[k*h_B.width+x];
    gold_C.elements[y*gold_C.width+x] = Cvalue;
  }

  if(verbose){
   // print gold_C
   printf("\n\ngold_C");
   for(int y=0; y<gold_C.height; y++){
    printf("\n");
    for(int x=0; x<gold_C.width; x++) {
       printf("%f ", gold_C.elements[y*gold_C.width+x]);
    }
   }

   // print h_C
   printf("\n\n\nh_C");
   for(int y=0; y<h_C.height; y++){
    printf("\n");
    for(int x=0; x<h_C.width; x++) {
       printf("%f ", h_C.elements[y*h_C.width+x]);
    }
   }
  }

  int errCnt = 0;
  for(int y=0; y<gold_C.height; y++){
   for(int x=0; x<gold_C.width; x++) {
      float it = gold_C.elements[y*gold_C.width+x];
      if(fabs(it - h_C.elements[y*h_C.width+x])> epsilon*it)
       errCnt++;
      
   }
  }

  if(errCnt==0)
    printf("\nTEST PASSED\n");
  else
    printf("\n\nTEST FAILED: number of errors:  %d\n", errCnt);

}