#include "cuda/kernels/graph.h"

#include <stdio.h>
 
const int N = 16; 
const int blocksize = 16;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ 
void hello(char *a, int *b) 
{
    a[threadIdx.x] += b[threadIdx.x];
}

void helloWorldCuda()
{
    printf("CUDA TEST START\n");
    printf("Should print \"Hello World\"\n");

    char a[N] = "Hello \0\0\0\0\0\0";
    int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    char c[N] = "YELLO \0\0\0\0\0\0";
 
    char *ad;
    int *bd;
    const int csize = N*sizeof(char);
    const int isize = N*sizeof(int);
 
    printf("%s", a);
 
    cudaMalloc( (void**)&ad, csize ); 
    cudaMalloc( (void**)&bd, isize ); 
    cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice ); 
    cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice ); 
    
    dim3 dimBlock( blocksize, 1 );
    dim3 dimGrid( 1, 1 );
    hello<<<dimGrid, dimBlock>>>(ad, bd);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    cudaMemcpy( c, ad, csize, cudaMemcpyDeviceToHost ); 
    cudaFree( ad );
    cudaFree( bd );
    
    printf("%s\n", c);
    printf("CUDA TEST END\n");
}