/* standard libraries */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* cuda blas libraries */
#include <cublas.h>

int main(int narg, char **arg)
{
  int ntimes; /* number of multiplications to perform */
  int nrows;  /* number of rows & cols in each matrix */

  cudaSetDevice(1);

  /* parse args */
  /* quick n' dirty */
  if (narg < 2)
    ntimes = 50;
  else
    ntimes = atoi(arg[1]);

  if (narg < 3)
    nrows = 2000;
  else
    nrows = atoi(arg[2]);

  /* total number of elements in each matrix */
  const int num_elements = nrows*nrows;

  unsigned int i;                   /* general purpose counter */
  struct timespec start_ts, end_ts; /* structs needed by clock_gettime */
  double start_time, end_time;      /* actual start and stop times in secs */
  double elapsed_time;              /* elapsed time in secs */

  float *M; /* initial/final matrix in host memory */

  cublasStatus status;  /* cublas status struct */
  float *d_M, *d_Z;     /* working matrices in device memory */
  float *d_temp;        /* temp matrix in device memory */

  /* allocate memory on host */
  M = (float*)malloc(sizeof(float)*num_elements);
  if (M == NULL) {
    fprintf(stderr, "Host memory allocation failed! Aborting.\n");
    exit(1);
  }

  /* seed random number generator */
  //srand(1);
  srand((unsigned int)time(NULL));

  /* fill in M with random numbers */
  for (i = 0; i < num_elements; ++i)
    M[i] = 0.001 * ((float)rand() / RAND_MAX);

  /* initialize cublas */
  status = cublasInit();
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "cublas failed to initialize! Aborting.\n");
    exit(1);
  }

  /* allocate memory on cuda device */
  status = cublasAlloc(num_elements, sizeof(float), (void**)&d_M);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "Device memory allocation failed! Aborting.\n");
    exit(1);
  }

  status = cublasAlloc(num_elements, sizeof(float), (void**)&d_Z);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "Device memory allocation failed! Aborting.\n");
    exit(1);
  }

  status = cublasAlloc(num_elements, sizeof(float), (void**)&d_temp);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "Device memory allocation failed! Aborting.\n");
    exit(1);
  }

  /* start clock */
  clock_gettime(CLOCK_REALTIME, &start_ts);

  /* copy M to d_M */
  status = cublasSetMatrix(nrows, nrows, sizeof(float),
                           M, nrows, d_M, nrows);
  /* status check skipped for performance */
  /*if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "Copy to device memory failed! Aborting.\n");
    exit(1);
  }*/

  /* copy d_M to d_Z */
  cublasScopy(num_elements, d_M, 1, d_Z, 1);

  /* perform multiplications on device */
  for (i = 0; i < ntimes; ++i)
  {
    /* multiply d_Z by d_M and store in d_temp */
    cublasSgemm('n', 'n', nrows, nrows, nrows,
                1.0f, d_Z, nrows, d_M, nrows, 0.0f,
                d_temp, nrows);

    /* status check skipped for performance */
    /*if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "sgemm failed to execute! Aborting.\n");
      exit(1);
    }*/

    /* copy d_temp to d_Z */
    cublasScopy(num_elements, d_temp, 1, d_Z, 1);
    /* status check skipped for performance */
    /*if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "scopy failed to execute! Aborting.\n");
      exit(1);
    }*/
  }

  /* copy results back to host memory */
  status = cublasGetMatrix(nrows, nrows, sizeof(float),
                           d_Z, nrows, M, nrows);
  /* status check skipped for performance */
  /*if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "Copy from device memory failed! Aborting.\n");
    exit(1);
  }*/

  /* print first value in result */
  printf("%f\n", M[0]);

  /* end clock */
  clock_gettime(CLOCK_REALTIME, &end_ts);

  /* figure final times */
  start_time = (double)start_ts.tv_sec + ((double)start_ts.tv_nsec)*0.000000001;
  end_time   = (double)end_ts.tv_sec   + ((double)end_ts.tv_nsec)*0.000000001;
  elapsed_time = end_time - start_time;

  /* print results */
  printf("-------------------------------------------------cublas--- %lf\n", elapsed_time);

  /* free host memory */
  free(M);

  /* free device memory */
  status = cublasFree(d_M);
  if (status != CUBLAS_STATUS_SUCCESS)
      fprintf (stderr, "Failed to free device memory.\n");

  status = cublasFree(d_Z);
  if (status != CUBLAS_STATUS_SUCCESS)
      fprintf (stderr, "Failed to free device memory.\n");

  status = cublasFree(d_temp);
  if (status != CUBLAS_STATUS_SUCCESS)
      fprintf (stderr, "Failed to free device memory.\n");

  /* teardown cublas session */
  if (cublasShutdown() != CUBLAS_STATUS_SUCCESS)
    fprintf (stderr, "Failed to shut down cublas.\n");

  /* have a nice day! */
  return 0;
}
