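/* Multiply two vectors A and B element-wise into C on GPU
 * Initialize A and B on host
 * 1 D grid
 * 1 D thread block
 * each thread handles nVperThr elements (command-line parameter),
 * spaced blockDim.x apart inside its block's contiguous chunk
 *
 */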

// Includes
#include <stdio.h>
#include <assert.h>
#include <cutil.h>

// Variables
// Buffers are declared at file scope so that both main() and Cleanup() can
// reach them.
float* h_A; // host input vector A (malloc'd and filled in main)
float* h_B; // host input vector B (malloc'd and filled in main)
float* h_C; // host copy of the result read back from d_C
float* h_G; // host reference ("golden") product computed on the CPU, compared against h_C
float* d_A; // device copy of A (cudaMalloc'd in main)
float* d_B; // device copy of B (cudaMalloc'd in main)
float* d_C; // device result vector written by the MulVec kernel

// Functions
// Cleanup: releases device and host buffers, calls cudaThreadExit(), flushes
// stdout/stderr and terminates the process. Pass false to make it print the
// failure message.
void Cleanup(bool);
// NOTE(review): ParseArguments is declared here but no definition or call is
// visible in this file -- confirm it is still needed.
void ParseArguments(int, char**);

// Device code
//
// Element-wise product: C[i] = A[i] * B[i].
//
// Work layout: block b owns the contiguous chunk
//   [b * blockDim.x * nVPT, (b + 1) * blockDim.x * nVPT)
// and each thread walks that chunk starting at its threadIdx.x with a step of
// blockDim.x, so it touches nVPT elements.
//
// Preconditions: only the x dimension of grid and block is used for indexing,
// and there is no bounds check -- A, B and C must each hold at least
// gridDim.x * blockDim.x * nVPT floats.
__global__ void MulVec(const float* A, const float* B, float* C, int nVPT)
{
    const int chunkBegin = blockIdx.x*blockDim.x * nVPT;   // first element owned by this block
    const int chunkEnd   = chunkBegin + nVPT*blockDim.x;   // one past the block's last element

    int idx = chunkBegin + threadIdx.x;
    while (idx < chunkEnd) {
        C[idx] = A[idx]*B[idx];
        idx += blockDim.x;   // neighbouring threads touch neighbouring elements
    }
}

// Host code
//
// Usage: <program> GrWdth BlWdth nVperThr
//   GrWdth   - number of blocks in the 1D grid
//   BlWdth   - number of threads in each 1D block
//   nVperThr - number of elements each thread processes
//
// Builds two vectors of N = GrWdth*BlWdth*nVperThr floats on the host,
// multiplies them element-wise on the GPU with MulVec, times one kernel run
// (after an untimed warm-up run), prints throughput figures and compares the
// GPU result against a CPU reference. Every exit path after allocation goes
// through Cleanup(), which terminates the process.
int main(int argc, char** argv)
{
    int GrWdth = 0; // Grid width
    int BlWdth = 0; // Block width
    int NVpThr = 0; // number of values per thread

    if(argc < 4){
        printf("Need 3 params: GrWdth BlWdth nVperThr");
        exit(1);
    }

    // Reject non-numeric and non-positive arguments: they would otherwise
    // produce a zero/negative N and undefined allocation sizes.
    if (sscanf(argv[1], "%d", &GrWdth) != 1 ||
        sscanf(argv[2], "%d", &BlWdth) != 1 ||
        sscanf(argv[3], "%d", &NVpThr) != 1 ||
        GrWdth <= 0 || BlWdth <= 0 || NVpThr <= 0) {
        printf("GrWdth, BlWdth and nVperThr must be positive integers\n");
        exit(1);
    }

    // The kernel indexes with int, so the element count must fit in an int.
    // Do the check in 64-bit arithmetic to avoid overflowing while checking.
    const long long maxInt   = (long long)(~0u >> 1);        // largest int value
    const long long nThreads = (long long)GrWdth * BlWdth;   // cannot overflow 64 bits
    if (nThreads > maxInt / NVpThr) {
        printf("GrWdth*BlWdth*nVperThr is too large for int indexing\n");
        exit(1);
    }

    int N = (int)(nThreads * NVpThr);  // fit the grid / block space, # vals per thread
    size_t size = N * sizeof(float);   // size in bytes
    dim3 dimGrid(GrWdth);
    dim3 dimBlock(BlWdth);

    cudaError_t error;

    // Allocate input vectors h_A and h_B, result h_C and reference h_G in host memory
    h_A = (float*)malloc(size);
    if (h_A == 0) Cleanup(false);
    h_B = (float*)malloc(size);
    if (h_B == 0) Cleanup(false);
    h_C = (float*)malloc(size);
    if (h_C == 0) Cleanup(false);
    h_G = (float*)malloc(size);
    if (h_G == 0) Cleanup(false);

    // Allocate vectors in device memory
    error = cudaMalloc((void**)&d_A, size);
    if (error != cudaSuccess) Cleanup(false);
    error = cudaMalloc((void**)&d_B, size);
    if (error != cudaSuccess) Cleanup(false);
    error = cudaMalloc((void**)&d_C, size);
    if (error != cudaSuccess) Cleanup(false);

    // Initialize inputs, clear the result and compute the CPU reference
    int i;
    for(i=0; i<N; ++i){
        h_A[i] = (float)i;
        h_B[i] = (float)(N-i);
        h_C[i] = (float)0;
        h_G[i] = (float)i*(N-i);
    }

    // transfer to device
    error = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) Cleanup(false);
    error = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) Cleanup(false);

    // warm up
    MulVec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, NVpThr);
    error = cudaGetLastError();
    if (error != cudaSuccess) Cleanup(false);
    // Kernel launches return before the kernel finishes; wait here so the
    // warm-up run is not counted in the timed region below.
    error = cudaThreadSynchronize();
    if (error != cudaSuccess) Cleanup(false);

    // set up timer
    unsigned int timer = 0;
    cutCreateTimer( &timer);
    cutStartTimer( timer);

    // Invoke kernel
    MulVec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, NVpThr);
    error = cudaGetLastError();
    if (error != cudaSuccess) Cleanup(false);

    // Wait for completion before stopping the timer; the return value also
    // reports faults that happened while the kernel was executing.
    error = cudaThreadSynchronize();
    cutStopTimer( timer);
    if (error != cudaSuccess) Cleanup(false);

    float time = cutGetTimerValue( timer);

    // Throughput figures are computed in double so that large N cannot
    // overflow an int byte count.
    double nFlops = (double)N;                           // one multiply per element
    double nGFlopsPerSec = 1e3*nFlops/time * 1e-9;

    double nBytes = 3.0 * sizeof(float) * (double)N;     // 2N words in, 1N word out
    double nGBytesPerSec = 1e3*nBytes/time * 1e-9;

    printf( "Time: %f (ms), GFlopsS: %f, GBytesS: %f\n",
             time, nGFlopsPerSec, nGBytesPerSec);

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    error = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) Cleanup(false);

    // Verify result: relative error of each element against the CPU reference.
    // The absolute value is taken of the difference so that results that are
    // too small (e.g. an untouched, all-zero h_C) are detected as well.
    for (i = 0; i < N; ++i) {
        float val = h_C[i];
        if (fabs(val - h_G[i]) > 1e-5*fabs(h_G[i]))
            break;
    }
    printf("Test %s \n", (i == N) ? "PASSED" : "FAILED");

    cutDeleteTimer( timer);

    // The reference buffer is no longer needed; null the pointer so it cannot
    // be released a second time.
    free(h_G);
    h_G = 0;

    Cleanup(true);
    return 0;   // not reached: Cleanup() exits the process
}

// Releases every buffer owned by this program, shuts down the CUDA context
// for the calling host thread, flushes the standard streams and terminates
// the process. Never returns.
//
// noError: true when the caller finished normally; false when it is bailing
//          out after a failed allocation, copy or kernel launch.
//
// Exit status is 0 only when noError is true and cudaThreadExit() succeeds,
// so that scripts driving the program can detect failures.
void Cleanup(bool noError) {  // simplified version from CUDA SDK
    cudaError_t error;

    // Free device memories (pointers are still null if allocation never happened)
    if (d_A)
        cudaFree(d_A);
    if (d_B)
        cudaFree(d_B);
    if (d_C)
        cudaFree(d_C);

    // Free host memory, including the CPU reference buffer h_G
    if (h_A)
        free(h_A);
    if (h_B)
        free(h_B);
    if (h_C)
        free(h_C);
    if (h_G)
        free(h_G);

    error = cudaThreadExit();

    const bool failed = !noError || error != cudaSuccess;
    if (failed)
        printf("cuda malloc or cuda thread exit failed \n");

    fflush( stdout);
    fflush( stderr);

    exit(failed ? 1 : 0);
}