The source code and dockerfile for the GSW2024 AI Lab.
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>

#include <chrono>
#include <iostream>

__global__ void cuda_kernel_basicAdd(int a, int b, int *c) {
    *c = a + b;
}

__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
    // Fused Multiply Add:
    // A * B + C => D
    /*
     * The variable i is used to index into the arrays. Since every thread executes this kernel,
     * i must be different for each thread; otherwise several threads would write to the same
     * array index. blockDim.x is the number of threads in the x dimension of the block,
     * blockIdx.x is the x coordinate of the current block, and threadIdx.x is the x coordinate
     * of the thread that is currently executing the kernel.
     */
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        D[i] = A[i] * B[i] + C[i];
    }
}

__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) {
    // Fused Multiply Add:
    // A * B + C => D
    // Layout:
    // A B C D A B C D A B C D
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if ((i*M) < N) {
        // Each thread processes M consecutive (A, B, C, D) tuples; the extra j < N check
        // keeps the last thread from running past the end when N is not a multiple of M.
        for (int j = i*M; j < i*M + M && j < N; ++j) {
            A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2];
        }
    }
}

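/*
 * Hypothetical helper, not part of the original source: packs four separate host arrays into
 * the interleaved "A B C D" layout that cuda_kernel_arrayFmaOptimized expects. It is shown
 * only to illustrate the packed memory layout; the function name is an assumption.
 */
static void packFmaInput(int const *A, int const *B, int const *C, int *packed, int N) {
    for (int i = 0; i < N; ++i) {
        packed[i*4]     = A[i]; // A
        packed[i*4 + 1] = B[i]; // B
        packed[i*4 + 2] = C[i]; // C
        packed[i*4 + 3] = 0;    // D, written by the kernel
    }
}
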
  36. extern "C" int cuda_basicAdd(int a, int b) {
  37. int c = 0;
  38. int *dev_c;
  39. cudaMalloc((void**)&dev_c, sizeof(int));
  40. cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c);
  41. cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
  42. //printf("%d + %d + 42 is %d\n", a, b, c);
  43. cudaFree(dev_c);
  44. return c;
  45. }
void cpp_cuda_bandwidthTest(int entryCount, int N) {
    // Size of the array in bytes
    size_t arraySize = entryCount * sizeof(int);
    int* deviceIntArray;
    // Host buffer holds entryCount ints (arraySize bytes)
    int* hostIntArray = new int[entryCount];
    // Allocate space on the device
    auto start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
            std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl;
            delete[] hostIntArray;
            return;
        }
        // Free memory on device
        if (cudaFree(deviceIntArray) != cudaSuccess) {
            std::cout << "Error in cudaFree!" << std::endl;
            delete[] hostIntArray;
            return;
        }
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    // 0.95367431640625 = 1e6 / 2^20, converting bytes per microsecond to MiB per second
    double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Allocating the array " << N << " times took " << copyTime << " microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per second allocation speed." << std::endl;
    if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
        std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl;
        delete[] hostIntArray;
        return;
    }
    // Prepare data
    for (int i = 0; i < entryCount; ++i) {
        hostIntArray[i] = i * 333 + 123;
    }
    // Copy data TO device
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl;
            // Free memory on device
            if (cudaFree(deviceIntArray) != cudaSuccess) {
                std::cout << "Error in cudaFree!" << std::endl;
            }
            delete[] hostIntArray;
            return;
        }
    }
    end_time = std::chrono::high_resolution_clock::now();
    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Copying the array " << N << " times took " << copyTime << " microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per second TO device." << std::endl;
    // Copy data FROM device
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl;
            // Free memory on device
            if (cudaFree(deviceIntArray) != cudaSuccess) {
                std::cout << "Error in cudaFree!" << std::endl;
            }
            delete[] hostIntArray;
            return;
        }
    }
    end_time = std::chrono::high_resolution_clock::now();
    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Copying the array " << N << " times took " << copyTime << " microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per second FROM device." << std::endl;
    // Free memory on device
    if (cudaFree(deviceIntArray) != cudaSuccess) {
        std::cout << "Error in cudaFree!" << std::endl;
    }
    delete[] hostIntArray;
}

  122. extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
  123. // Size of the Arrays
  124. size_t arraySize = N * sizeof(int);
  125. int* deviceIntArrayA;
  126. int* deviceIntArrayB;
  127. int* deviceIntArrayC;
  128. int* deviceIntArrayD;
  129. // Allocate space on the device
  130. if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
  131. printf("Error in cudaMalloc1!\n");
  132. return;
  133. }
  134. if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) {
  135. printf("Error in cudaMalloc2!\n");
  136. cudaFree(deviceIntArrayA);
  137. return;
  138. }
  139. if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) {
  140. printf("Error in cudaMalloc3!\n");
  141. cudaFree(deviceIntArrayA);
  142. cudaFree(deviceIntArrayB);
  143. return;
  144. }
  145. if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) {
  146. printf("Error in cudaMalloc4!\n");
  147. cudaFree(deviceIntArrayA);
  148. cudaFree(deviceIntArrayB);
  149. cudaFree(deviceIntArrayC);
  150. return;
  151. }
  152. // Copy data TO device
  153. if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
  154. printf("Error in cudaMemcpy!\n");
  155. cudaFree(deviceIntArrayA);
  156. cudaFree(deviceIntArrayB);
  157. cudaFree(deviceIntArrayC);
  158. cudaFree(deviceIntArrayD);
  159. return;
  160. }
  161. if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
  162. printf("Error in cudaMemcpy!\n");
  163. cudaFree(deviceIntArrayA);
  164. cudaFree(deviceIntArrayB);
  165. cudaFree(deviceIntArrayC);
  166. cudaFree(deviceIntArrayD);
  167. return;
  168. }
  169. if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
  170. printf("Error in cudaMemcpy!\n");
  171. cudaFree(deviceIntArrayA);
  172. cudaFree(deviceIntArrayB);
  173. cudaFree(deviceIntArrayC);
  174. cudaFree(deviceIntArrayD);
  175. return;
  176. }
  177. // Festlegung der Threads pro Block
  178. int threadsPerBlock = 512;
  179. // Es werden soviele Bl�cke ben�tigt, dass alle Elemente der Vektoren abgearbeitet werden k�nnen
  180. int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
  181. // Run kernel
  182. cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N);
  183. // Copy data FROM device
  184. if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
  185. printf("Error in cudaMemcpy!\n");
  186. cudaFree(deviceIntArrayA);
  187. cudaFree(deviceIntArrayB);
  188. cudaFree(deviceIntArrayC);
  189. cudaFree(deviceIntArrayD);
  190. return;
  191. }
  192. // Free memory on device
  193. cudaFree(deviceIntArrayA);
  194. cudaFree(deviceIntArrayB);
  195. cudaFree(deviceIntArrayC);
  196. cudaFree(deviceIntArrayD);
  197. }
  198. extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) {
  199. // Size of the Arrays
  200. size_t arraySize = N * sizeof(int) * 4;
  201. int* deviceIntArrayA;
  202. // Allocate space on the device
  203. if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
  204. printf("Error in cudaMalloc1!\n");
  205. return;
  206. }
  207. #define ONFAILFREE0() do { } while(0)
  208. #define ONFAILFREE1(a) do { cudaFree(a); } while(0)
  209. #define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0)
  210. #define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0)
  211. #define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0)
  212. #define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { int retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in func__!\n"); return; } } while(0)
  213. // Copy data TO device
  214. CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice);
  215. /*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
  216. printf("Error in cudaMemcpy!\n");
  217. cudaFree(deviceIntArrayA);
  218. return;
  219. }*/
  220. // Festlegung der Threads pro Block
  221. int threadsPerBlock = 512;
  222. // Es werden soviele Bl�cke ben�tigt, dass alle Elemente der Vektoren abgearbeitet werden k�nnen
  223. int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
  224. // Run kernel
  225. cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M);
  226. // Copy data FROM device
  227. if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
  228. printf("Error in cudaMemcpy!\n");
  229. cudaFree(deviceIntArrayA);
  230. return;
  231. }
  232. // Free memory on device
  233. if (cudaFree(deviceIntArrayA) != cudaSuccess) {
  234. printf("Error in cudaFree!\n");
  235. return;
  236. }
  237. }
  238. extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
  239. for (int i = 0; i < N; ++i) {
  240. D[i] = A[i] * B[i] + C[i];
  241. }
  242. }
  243. extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) {
  244. for (int i = 0; i < N; i += 4) {
  245. A[i+3] = A[i] * A[i+1] + A[i+2];
  246. }
  247. }
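
/*
 * A minimal, hypothetical host-side sketch of how the exported functions above might be
 * driven when this file is compiled with nvcc and linked into a small test program. The
 * main() below is not part of the original source; the guard macro name, array sizes, and
 * values are assumptions chosen purely for illustration.
 */
#ifdef CUDA_FOR_STORM_STANDALONE_TEST
int main() {
    // Scalar addition on the device
    printf("cuda_basicAdd(2, 3) = %d\n", cuda_basicAdd(2, 3));

    // Element-wise fused multiply-add: D[i] = A[i] * B[i] + C[i]
    const int N = 8;
    int A[N], B[N], C[N], D[N];
    for (int i = 0; i < N; ++i) {
        A[i] = i;
        B[i] = 2;
        C[i] = 1;
    }
    cuda_arrayFma(A, B, C, D, N);
    for (int i = 0; i < N; ++i) {
        printf("D[%d] = %d\n", i, D[i]);
    }

    // Rough allocation / transfer bandwidth measurement: 1M ints, 100 repetitions
    cpp_cuda_bandwidthTest(1 << 20, 100);
    return 0;
}
#endif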