/* Allocate Vector on GPU
 * Initialize on the device
 * 2 D Grid
 * 2 D thread block; each thread writes its linear index into one vector element:
 *   (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y)
 *     + threadIdx.y * blockDim.x + threadIdx.x
 *
 */

// Includes
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Variables
// Both pointers live at file scope so that Cleanup() can release them from
// any exit path taken in main().
float* h_V; // host buffer: malloc'd in main(), freed in Cleanup()
float* d_V; // device buffer: cudaMalloc'd in main(), cudaFree'd in Cleanup()

bool noprompt = false; // not referenced anywhere in the visible code

// Functions
void Cleanup(bool);               // releases h_V / d_V and terminates the process
void ParseArguments(int, char**); // NOTE(review): declared, but no definition or call is visible in this file

// Device code
/*
 * TouchVec: every thread stores its own global linear index into V.
 *
 * Launch layout: 2D grid of 2D thread blocks. Blocks are numbered row by row
 * across the grid, and threads row by row inside each block, so
 *
 *   idx = (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y)
 *         + threadIdx.y * blockDim.x + threadIdx.x
 *
 * which is the same mapping the host code uses when it inspects the result.
 *
 * Parameters:
 *   V - device pointer to at least N floats
 *   N - number of valid elements in V; threads whose index is >= N do nothing,
 *       so launching more threads than elements is safe.
 */
__global__ void TouchVec(float* V, int N)
{
    const int threadsPerBlock = blockDim.x * blockDim.y;
    const int blockId         = blockIdx.y * gridDim.x + blockIdx.x;
    const int threadInBlock   = threadIdx.y * blockDim.x + threadIdx.x;
    const int idx             = blockId * threadsPerBlock + threadInBlock;

    // Guard the tail: the grid may cover more threads than V has elements.
    if (idx < N)
        V[idx] = (float)idx;
}

// Host code
// Report a failed CUDA runtime call (with the runtime's own error text) and
// terminate through Cleanup(); returns normally when status is cudaSuccess.
static void CheckCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        Cleanup(false);
    }
}

// Linear vector index of thread (tx, ty) inside block (bx, by): blocks are
// numbered row by row across the grid, threads row by row inside a block.
static int ElementIndex(dim3 grid, dim3 block, int bx, int by, int tx, int ty)
{
    const int blockSize         = (int)(block.x * block.y); // threads per block
    const int threadsPerGridRow = (int)grid.x * blockSize;  // threads per row of blocks
    return by * threadsPerGridRow + bx * blockSize + ty * (int)block.x + tx;
}

// Print the values written by three corner threads of block (bx, by):
// (0,0), (last x, 0) and (0, last y).
static void PrintBlockCorners(const float* v, dim3 grid, dim3 block, int bx, int by)
{
    const int lastTx = (int)block.x - 1;
    const int lastTy = (int)block.y - 1;

    printf("Corner values in thread block Bx=%d By=%d\n", bx, by);
    printf("G(Tx=0,Ty=0) = %f, G(Tx=%d,Ty=0) = %f, G(Tx=0,Ty=%d) = %f\n",
           v[ElementIndex(grid, block, bx, by, 0, 0)],
           lastTx, v[ElementIndex(grid, block, bx, by, lastTx, 0)],
           lastTy, v[ElementIndex(grid, block, bx, by, 0, lastTy)]);
}

// Host code
//
// Allocates one vector on the host and one on the device, launches TouchVec
// on a 4x2 grid of 16x8 blocks, copies the result back, checks that element i
// holds the value i, and prints corner values of three thread blocks.
// Always leaves through Cleanup(), which terminates the process.
int main(int argc, char** argv)
{
    (void)argc; // command-line arguments are not used
    (void)argv;

    printf("Allocate Vector, initialize it in GPU, grid 2D, block 2D \n");

    const dim3 dimGrid(4, 2);       // GridDx=4 width, GridDy=2 height
    const dim3 dimBlock(16, 8, 1);  // BlockDx=16 width, BlockDy=8 height

    // One vector element per launched thread, so the size follows the launch
    // configuration instead of repeating its numbers.
    const int N = (int)(dimGrid.x * dimGrid.y * dimBlock.x * dimBlock.y);
    const size_t size = N * sizeof(float); // size in bytes

    // Allocate the vector h_V in host memory
    h_V = (float*)malloc(size);
    if (h_V == NULL) {
        fprintf(stderr, "host malloc of %lu bytes failed\n", (unsigned long)size);
        Cleanup(false);
    }

    // Allocate the vector d_V in device memory
    CheckCuda(cudaMalloc((void**)&d_V, size), "cudaMalloc");

    // Invoke kernel; launch-configuration errors surface via cudaGetLastError
    TouchVec<<<dimGrid, dimBlock>>>(d_V, N);
    CheckCuda(cudaGetLastError(), "TouchVec launch");
#ifdef _DEBUG
    // Debug builds wait for the kernel so execution errors are reported here
    // rather than at the copy below.
    CheckCuda(cudaDeviceSynchronize(), "TouchVec execution");
#endif

    // Copy result from device memory to host memory (blocking copy);
    // h_V contains the result in host memory afterwards
    CheckCuda(cudaMemcpy(h_V, d_V, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");

    // Verify result: element i must hold the value i
    int i;
    for (i = 0; i < N; ++i) {
        if (fabsf(h_V[i] - (float)i) > 1e-5f)
            break;
    }
    printf("Test %s \n", (i == N) ? "PASSED" : "FAILED");

    // Print corner values of the first block, the last block of the first
    // grid row, and the first block of the last grid row
    PrintBlockCorners(h_V, dimGrid, dimBlock, 0, 0);
    PrintBlockCorners(h_V, dimGrid, dimBlock, (int)dimGrid.x - 1, 0);
    PrintBlockCorners(h_V, dimGrid, dimBlock, 0, (int)dimGrid.y - 1);

    Cleanup(true); // does not return
    return 0;
}

// Release the host and device buffers, reset the device, and terminate the
// process (simplified version from the CUDA SDK).
//
// noError - true when the caller reached this point without a failure.
//
// The process exits with EXIT_SUCCESS only if noError is true and the device
// teardown itself succeeded; otherwise it prints a diagnostic and exits with
// EXIT_FAILURE so that scripts can detect the failure. Never returns.
void Cleanup(bool noError) {
    cudaError_t error = cudaSuccess;

    // Free device memory
    if (d_V) {
        error = cudaFree(d_V);
        d_V = NULL;
    }

    // Free host memory
    if (h_V) {
        free(h_V);
        h_V = NULL;
    }

    // cudaDeviceReset replaces the deprecated cudaThreadExit. Keep the first
    // teardown error if cudaFree already failed.
    const cudaError_t resetStatus = cudaDeviceReset();
    if (error == cudaSuccess)
        error = resetStatus;

    const bool failed = !noError || error != cudaSuccess;
    if (failed) {
        printf("cuda malloc or cuda thread exit failed \n");
        if (error != cudaSuccess)
            fprintf(stderr, "CUDA teardown error: %s\n", cudaGetErrorString(error));
    }

    fflush(stdout);
    fflush(stderr);

    exit(failed ? EXIT_FAILURE : EXIT_SUCCESS);
}