#include <stdio.h>
 
// Number of elements in each host array declared in helloWorld(); the device
// buffer sizes there are derived from it.
const int N = 16; 
// Threads per block (x dimension) used for the hello kernel launch. The kernel
// handles one element per thread, so this matches N.
const int blocksize = 16;

// Checks the result of a CUDA runtime call and reports failures together with
// the file and line of the call site.
// Wrapped in do { } while (0) so that `gpuErrchk(x);` expands to exactly one
// statement and stays correct inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a CUDA error, if any.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (normally __FILE__)
//   line  - source line of the call site (normally __LINE__)
//   abort - when true (the default), terminate the process with `code` as
//           the exit status after printing the message
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess)
   {
      return;
   }

   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort)
   {
      exit(code);
   }
}

// Adds b[i] to a[i] in place, where i is the calling thread's threadIdx.x.
//   a - device-accessible char buffer, updated in place
//   b - device-accessible int buffer of per-element offsets
// Only threadIdx.x is used for indexing, so both buffers must hold at least
// blockDim.x elements; there is no bounds check and no block offset.
__global__
void hello(char *a, int *b)
{
    const unsigned int idx = threadIdx.x;
    // The sum is computed as int and narrowed back to char, as with `+=`.
    a[idx] = static_cast<char>(a[idx] + b[idx]);
}

namespace stormcuda {
    namespace graph {
        /**
         * CUDA smoke test.
         *
         * Copies the string "Hello " and an array of per-character offsets to
         * the device, launches the hello kernel to add the offsets to the
         * characters, copies the result back and prints it after the
         * original string.
         *
         * Every CUDA runtime call is checked with gpuErrchk, so a failure
         * prints a diagnostic and terminates the process instead of being
         * silently ignored.
         */
        void helloWorld() {
            printf("CUDA TEST START\n");
            printf("Should print \"Hello World\"\n");

            // Adding b[i] to a[i] turns "Hello " into "World!".
            char a[N] = "Hello \0\0\0\0\0\0";
            int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
            // Destination for the device result; if the copy-back did not
            // overwrite it, this placeholder text is what would be printed.
            char c[N] = "YELLO \0\0\0\0\0\0";

            char *ad = NULL;
            int *bd = NULL;
            const size_t csize = N * sizeof(char);
            const size_t isize = N * sizeof(int);

            printf("%s", a);

            gpuErrchk(cudaMalloc((void **) &ad, csize));
            gpuErrchk(cudaMalloc((void **) &bd, isize));
            gpuErrchk(cudaMemcpy(ad, a, csize, cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy(bd, b, isize, cudaMemcpyHostToDevice));

            // One block of `blocksize` threads: one thread per array element.
            dim3 dimBlock(blocksize, 1);
            dim3 dimGrid(1, 1);
            hello<<<dimGrid, dimBlock>>>(ad, bd);

            // The launch itself returns nothing: the first check catches
            // launch errors, the synchronize surfaces errors raised while
            // the kernel was running.
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            gpuErrchk(cudaMemcpy(c, ad, csize, cudaMemcpyDeviceToHost));
            gpuErrchk(cudaFree(ad));
            gpuErrchk(cudaFree(bd));

            printf("%s\n", c);
            printf("CUDA TEST END\n");
        }
    }
}