/* Add two Vectors A and B in C on GPU
 * Initialize A and B on host
 * 2 D Grid 
 * 2 D thread block
 * each thread adds 1K consecutive elements
 *
 */

// Includes
#include <stdio.h>
#include <assert.h>
#include <cutil.h>

// Variables
float* h_A; 
float* h_B; 
float* h_C; 
float* d_A; 
float* d_B; 
float* d_C; 

// Functions
void Cleanup(bool);
void ParseArguments(int, char**);
void checkCUDAError(const char *msg);

// Device code
__global__ void AddVec(const float* A, const float* B, float* C, int N)
{
// do it
}

// Host code
int main(int argc, char** argv)
{
    int GrWdth; // Grid width
    int GrHght; // Grid height
    int BlWdth; // Block width
    int BlHght; // Block height
    int NVpThr; // number of values per thread


    if(argc != 6){
     printf("Need 5 params: GrWdth GrHght BlWdth BlHght nVperThr");
     exit(0);
    } else {
      sscanf(argv[1], "%d", &GrWdth);
      sscanf(argv[2], "%d", &GrHght);
      sscanf(argv[3], "%d", &BlWdth);
      sscanf(argv[4], "%d", &BlHght);
      sscanf(argv[5], "%d", &NVpThr);
    }      

    printf("Allocate Vectors A,B,C, initialize A and B on host, device adds the vectors consecutively\n");
    int N = GrWdth*GrHght*BlWdth*BlHght*NVpThr;  // fit the grid / block space, # vals per thread
    size_t size = N * sizeof(float);             // size in bytes
    dim3 dimGrid(GrWdth,GrHght);                    
    dim3 dimBlock(BlWdth,BlHght,1);                 


    cudaError_t error;

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float*)malloc(size);
    if (h_A == 0) Cleanup(false);
    h_B = (float*)malloc(size);
    if (h_B == 0) Cleanup(false);
    h_C = (float*)malloc(size);
    if (h_C == 0) Cleanup(false);

    // Allocate vectors in device memory
    error = cudaMalloc((void**)&d_A, size);
    if (error != cudaSuccess) Cleanup(false);
    error = cudaMalloc((void**)&d_B, size);
    if (error != cudaSuccess) Cleanup(false);
    error = cudaMalloc((void**)&d_C, size);
    if (error != cudaSuccess) Cleanup(false);

    // Initialize h_A and h_B
    int i;
    for(i=0; i<N; ++i){
     h_A[i] = (float)i;
     h_B[i] = (float)(N-i);   
    }

    // transfer A and B to device
    error = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) Cleanup(false);
    error = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) Cleanup(false);

    // warm up
    AddVec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
    error = cudaGetLastError();
    if (error != cudaSuccess) Cleanup(false);

    // set up timer
    unsigned int timer = 0;
    cutCreateTimer( &timer);
    cutStartTimer( timer);

    // Invoke kernel

    AddVec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
    error = cudaGetLastError();
    if (error != cudaSuccess) Cleanup(false);
#ifdef _DEBUG
    error = cudaThreadSynchronize();
    if (error != cudaSuccess) Cleanup(false);
#endif
 
    cudaThreadSynchronize() ;
    cutStopTimer( timer);
    float time = cutGetTimerValue( timer);

    int nFlops = N;
    float nFlopsPerSec = 1e3*nFlops/time;
    float nGFlopsPerSec = nFlopsPerSec*1e-9;

    int nBytes = 3*4*N; // 2N words in, 1N word out
    float nBytesPerSec = 1e3*nBytes/time;
    float nGBytesPerSec = nBytesPerSec*1e-9;

    printf( "Time: %f (ms), GFlopsS: %f, GBytesS: %f\n", 
             time, nGFlopsPerSec, nGBytesPerSec);
     

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    error = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) Cleanup(false);

    // Verify result
    for (i = 0; i < N; ++i) {
        float val = h_C[i];
        if (fabs(val - N) > 1e-5)
            break;
    }
    printf("Test %s \n", (i == N) ? "PASSED" : "FAILED");

    cutDeleteTimer( timer);

    Cleanup(true);
}

void Cleanup(bool noError) {  // simplified version from CUDA SDK
    cudaError_t error;
        
    // Free device memories
    if (d_A)
        cudaFree(d_A);
    if (d_B)
        cudaFree(d_B);
    if (d_C)
        cudaFree(d_C);

    // Free host memory
    if (h_A)
        free(h_A);
    if (h_B)
        free(h_B);
    if (h_C)
        free(h_C);
        
    error = cudaThreadExit();
    
    if (!noError || error != cudaSuccess)
        printf("cuda malloc or cuda thread exit failed \n");
    
    fflush( stdout);
    fflush( stderr);

    exit(0);
}

void checkCUDAError(const char *msg)
{
  cudaError_t err = cudaGetLastError();
  if( cudaSuccess != err) 
    {
      fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
      exit(-1);
    }                         
}