You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

878 lines
40 KiB

4 months ago
  1. #include "basicValueIteration.h"
  2. #define CUSP_USE_TEXTURE_MEMORY
  3. #include <iostream>
  4. #include <chrono>
  5. #include <cuda_runtime.h>
  6. #include "cusparse_v2.h"
  7. #include "utility.h"
  8. #include "cuspExtension.h"
  9. #include <thrust/transform.h>
  10. #include <thrust/device_ptr.h>
  11. #include <thrust/functional.h>
  12. #include "storm-cudaplugin-config.h"
  13. #ifdef DEBUG
  14. #define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false)
  15. #else
  16. #define CUDA_CHECK_ALL_ERRORS() do {} while (false)
  17. #endif
  18. template<typename T, bool Relative>
  19. struct equalModuloPrecision : public thrust::binary_function<T,T,T>
  20. {
  21. __host__ __device__ T operator()(const T &x, const T &y) const
  22. {
  23. if (Relative) {
  24. if (y == 0) {
  25. return ((x >= 0) ? (x) : (-x));
  26. }
  27. const T result = (x - y) / y;
  28. return ((result >= 0) ? (result) : (-result));
  29. } else {
  30. const T result = (x - y);
  31. return ((result >= 0) ? (result) : (-result));
  32. }
  33. }
  34. };
  35. template<typename IndexType, typename ValueType>
  36. void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) {
  37. indexVector.reserve(inputVector.size());
  38. valueVector.reserve(inputVector.size());
  39. for (size_t i = 0; i < inputVector.size(); ++i) {
  40. indexVector.push_back(inputVector.at(i).first);
  41. valueVector.push_back(inputVector.at(i).second);
  42. }
  43. }
  44. // TEMPLATE VERSION
  45. template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
  46. bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) {
  47. //std::vector<IndexType> matrixColumnIndices;
  48. //std::vector<ValueType> matrixValues;
  49. //exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
  50. bool errorOccured = false;
  51. IndexType* device_matrixRowIndices = nullptr;
  52. ValueType* device_matrixColIndicesAndValues = nullptr;
  53. ValueType* device_x = nullptr;
  54. ValueType* device_xSwap = nullptr;
  55. ValueType* device_b = nullptr;
  56. ValueType* device_multiplyResult = nullptr;
  57. IndexType* device_nondeterministicChoiceIndices = nullptr;
  58. #ifdef DEBUG
  59. std::cout.sync_with_stdio(true);
  60. std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
  61. std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
  62. size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
  63. std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
  64. #endif
  65. const IndexType matrixRowCount = matrixRowIndices.size() - 1;
  66. const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1;
  67. const IndexType matrixNnzCount = columnIndicesAndValues.size();
  68. cudaError_t cudaMallocResult;
  69. bool converged = false;
  70. iterationCount = 0;
  71. CUDA_CHECK_ALL_ERRORS();
  72. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
  73. if (cudaMallocResult != cudaSuccess) {
  74. std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
  75. errorOccured = true;
  76. goto cleanup;
  77. }
  78. #ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
  79. #define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true
  80. #else
  81. #define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false
  82. #endif
  83. if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
  84. CUDA_CHECK_ALL_ERRORS();
  85. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
  86. if (cudaMallocResult != cudaSuccess) {
  87. std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
  88. errorOccured = true;
  89. goto cleanup;
  90. }
  91. } else {
  92. CUDA_CHECK_ALL_ERRORS();
  93. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
  94. if (cudaMallocResult != cudaSuccess) {
  95. std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
  96. errorOccured = true;
  97. goto cleanup;
  98. }
  99. }
  100. CUDA_CHECK_ALL_ERRORS();
  101. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
  102. if (cudaMallocResult != cudaSuccess) {
  103. std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
  104. errorOccured = true;
  105. goto cleanup;
  106. }
  107. CUDA_CHECK_ALL_ERRORS();
  108. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_xSwap), sizeof(ValueType) * matrixColCount);
  109. if (cudaMallocResult != cudaSuccess) {
  110. std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl;
  111. errorOccured = true;
  112. goto cleanup;
  113. }
  114. CUDA_CHECK_ALL_ERRORS();
  115. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * matrixRowCount);
  116. if (cudaMallocResult != cudaSuccess) {
  117. std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
  118. errorOccured = true;
  119. goto cleanup;
  120. }
  121. CUDA_CHECK_ALL_ERRORS();
  122. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
  123. if (cudaMallocResult != cudaSuccess) {
  124. std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
  125. errorOccured = true;
  126. goto cleanup;
  127. }
  128. CUDA_CHECK_ALL_ERRORS();
  129. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1));
  130. if (cudaMallocResult != cudaSuccess) {
  131. std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
  132. errorOccured = true;
  133. goto cleanup;
  134. }
  135. #ifdef DEBUG
  136. std::cout << "(DLL) Finished allocating memory." << std::endl;
  137. #endif
  138. // Memory allocated, copy data to device
  139. cudaError_t cudaCopyResult;
  140. CUDA_CHECK_ALL_ERRORS();
  141. cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
  142. if (cudaCopyResult != cudaSuccess) {
  143. std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
  144. errorOccured = true;
  145. goto cleanup;
  146. }
  147. // Copy all data as floats are expanded to 64bits :/
  148. if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
  149. CUDA_CHECK_ALL_ERRORS();
  150. cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
  151. if (cudaCopyResult != cudaSuccess) {
  152. std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
  153. errorOccured = true;
  154. goto cleanup;
  155. }
  156. } else {
  157. CUDA_CHECK_ALL_ERRORS();
  158. cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
  159. if (cudaCopyResult != cudaSuccess) {
  160. std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
  161. errorOccured = true;
  162. goto cleanup;
  163. }
  164. }
  165. CUDA_CHECK_ALL_ERRORS();
  166. cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
  167. if (cudaCopyResult != cudaSuccess) {
  168. std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
  169. errorOccured = true;
  170. goto cleanup;
  171. }
  172. // Preset the xSwap to zeros...
  173. CUDA_CHECK_ALL_ERRORS();
  174. cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount);
  175. if (cudaCopyResult != cudaSuccess) {
  176. std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl;
  177. errorOccured = true;
  178. goto cleanup;
  179. }
  180. CUDA_CHECK_ALL_ERRORS();
  181. cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice);
  182. if (cudaCopyResult != cudaSuccess) {
  183. std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
  184. errorOccured = true;
  185. goto cleanup;
  186. }
  187. // Preset the multiplyResult to zeros...
  188. CUDA_CHECK_ALL_ERRORS();
  189. cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
  190. if (cudaCopyResult != cudaSuccess) {
  191. std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
  192. errorOccured = true;
  193. goto cleanup;
  194. }
  195. CUDA_CHECK_ALL_ERRORS();
  196. cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice);
  197. if (cudaCopyResult != cudaSuccess) {
  198. std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
  199. errorOccured = true;
  200. goto cleanup;
  201. }
  202. #ifdef DEBUG
  203. std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
  204. #endif
  205. // Data is on device, start Kernel
  206. while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization
  207. cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
  208. CUDA_CHECK_ALL_ERRORS();
  209. thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
  210. thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult);
  211. // Transform: Add multiplyResult + b inplace to multiplyResult
  212. thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>());
  213. CUDA_CHECK_ALL_ERRORS();
  214. // Reduce: Reduce multiplyResult to a new x vector
  215. cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
  216. CUDA_CHECK_ALL_ERRORS();
  217. // Check for convergence
  218. // Transform: x = abs(x - xSwap)/ xSwap
  219. thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
  220. thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount);
  221. thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap);
  222. thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
  223. CUDA_CHECK_ALL_ERRORS();
  224. // Reduce: get Max over x and check for res < Precision
  225. ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
  226. CUDA_CHECK_ALL_ERRORS();
  227. converged = (maxX < precision);
  228. ++iterationCount;
  229. // Swap pointers, device_x always contains the most current result
  230. std::swap(device_x, device_xSwap);
  231. }
  232. if (!converged && (iterationCount == maxIterationCount)) {
  233. iterationCount = 0;
  234. errorOccured = true;
  235. }
  236. #ifdef DEBUG
  237. std::cout << "(DLL) Finished kernel execution." << std::endl;
  238. std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
  239. #endif
  240. // Get x back from the device
  241. cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost);
  242. if (cudaCopyResult != cudaSuccess) {
  243. std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl;
  244. errorOccured = true;
  245. goto cleanup;
  246. }
  247. #ifdef DEBUG
  248. std::cout << "(DLL) Finished copying result data." << std::endl;
  249. #endif
  250. // All code related to freeing memory and clearing up the device
  251. cleanup:
  252. if (device_matrixRowIndices != nullptr) {
  253. cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices);
  254. if (cudaFreeResult != cudaSuccess) {
  255. std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl;
  256. errorOccured = true;
  257. }
  258. device_matrixRowIndices = nullptr;
  259. }
  260. if (device_matrixColIndicesAndValues != nullptr) {
  261. cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
  262. if (cudaFreeResult != cudaSuccess) {
  263. std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
  264. errorOccured = true;
  265. }
  266. device_matrixColIndicesAndValues = nullptr;
  267. }
  268. if (device_x != nullptr) {
  269. cudaError_t cudaFreeResult = cudaFree(device_x);
  270. if (cudaFreeResult != cudaSuccess) {
  271. std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
  272. errorOccured = true;
  273. }
  274. device_x = nullptr;
  275. }
  276. if (device_xSwap != nullptr) {
  277. cudaError_t cudaFreeResult = cudaFree(device_xSwap);
  278. if (cudaFreeResult != cudaSuccess) {
  279. std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl;
  280. errorOccured = true;
  281. }
  282. device_xSwap = nullptr;
  283. }
  284. if (device_b != nullptr) {
  285. cudaError_t cudaFreeResult = cudaFree(device_b);
  286. if (cudaFreeResult != cudaSuccess) {
  287. std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl;
  288. errorOccured = true;
  289. }
  290. device_b = nullptr;
  291. }
  292. if (device_multiplyResult != nullptr) {
  293. cudaError_t cudaFreeResult = cudaFree(device_multiplyResult);
  294. if (cudaFreeResult != cudaSuccess) {
  295. std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl;
  296. errorOccured = true;
  297. }
  298. device_multiplyResult = nullptr;
  299. }
  300. if (device_nondeterministicChoiceIndices != nullptr) {
  301. cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices);
  302. if (cudaFreeResult != cudaSuccess) {
  303. std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." << std::endl;
  304. errorOccured = true;
  305. }
  306. device_nondeterministicChoiceIndices = nullptr;
  307. }
  308. #ifdef DEBUG
  309. std::cout << "(DLL) Finished cleanup." << std::endl;
  310. #endif
  311. return !errorOccured;
  312. }
  313. template <typename IndexType, typename ValueType>
  314. void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) {
  315. IndexType* device_matrixRowIndices = nullptr;
  316. ValueType* device_matrixColIndicesAndValues = nullptr;
  317. ValueType* device_x = nullptr;
  318. ValueType* device_multiplyResult = nullptr;
  319. #ifdef DEBUG
  320. std::cout.sync_with_stdio(true);
  321. std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl;
  322. std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
  323. size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size();
  324. std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
  325. #endif
  326. const IndexType matrixRowCount = matrixRowIndices.size() - 1;
  327. const IndexType matrixNnzCount = columnIndicesAndValues.size();
  328. cudaError_t cudaMallocResult;
  329. CUDA_CHECK_ALL_ERRORS();
  330. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
  331. if (cudaMallocResult != cudaSuccess) {
  332. std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
  333. goto cleanup;
  334. }
  335. #ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
  336. CUDA_CHECK_ALL_ERRORS();
  337. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
  338. if (cudaMallocResult != cudaSuccess) {
  339. std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
  340. goto cleanup;
  341. }
  342. #else
  343. CUDA_CHECK_ALL_ERRORS();
  344. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
  345. if (cudaMallocResult != cudaSuccess) {
  346. std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
  347. goto cleanup;
  348. }
  349. #endif
  350. CUDA_CHECK_ALL_ERRORS();
  351. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
  352. if (cudaMallocResult != cudaSuccess) {
  353. std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
  354. goto cleanup;
  355. }
  356. CUDA_CHECK_ALL_ERRORS();
  357. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
  358. if (cudaMallocResult != cudaSuccess) {
  359. std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
  360. goto cleanup;
  361. }
  362. #ifdef DEBUG
  363. std::cout << "(DLL) Finished allocating memory." << std::endl;
  364. #endif
  365. // Memory allocated, copy data to device
  366. cudaError_t cudaCopyResult;
  367. CUDA_CHECK_ALL_ERRORS();
  368. cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
  369. if (cudaCopyResult != cudaSuccess) {
  370. std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
  371. goto cleanup;
  372. }
  373. #ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
  374. CUDA_CHECK_ALL_ERRORS();
  375. cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
  376. if (cudaCopyResult != cudaSuccess) {
  377. std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
  378. goto cleanup;
  379. }
  380. #else
  381. CUDA_CHECK_ALL_ERRORS();
  382. cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
  383. if (cudaCopyResult != cudaSuccess) {
  384. std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
  385. goto cleanup;
  386. }
  387. #endif
  388. CUDA_CHECK_ALL_ERRORS();
  389. cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
  390. if (cudaCopyResult != cudaSuccess) {
  391. std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
  392. goto cleanup;
  393. }
  394. // Preset the multiplyResult to zeros...
  395. CUDA_CHECK_ALL_ERRORS();
  396. cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
  397. if (cudaCopyResult != cudaSuccess) {
  398. std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
  399. goto cleanup;
  400. }
  401. #ifdef DEBUG
  402. std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
  403. #endif
  404. cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
  405. CUDA_CHECK_ALL_ERRORS();
  406. #ifdef DEBUG
  407. std::cout << "(DLL) Finished kernel execution." << std::endl;
  408. #endif
  409. // Get result back from the device
  410. cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost);
  411. if (cudaCopyResult != cudaSuccess) {
  412. std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
  413. goto cleanup;
  414. }
  415. #ifdef DEBUG
  416. std::cout << "(DLL) Finished copying result data." << std::endl;
  417. #endif
  418. // All code related to freeing memory and clearing up the device
  419. cleanup:
  420. if (device_matrixRowIndices != nullptr) {
  421. cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices);
  422. if (cudaFreeResult != cudaSuccess) {
  423. std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl;
  424. }
  425. device_matrixRowIndices = nullptr;
  426. }
  427. if (device_matrixColIndicesAndValues != nullptr) {
  428. cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
  429. if (cudaFreeResult != cudaSuccess) {
  430. std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
  431. }
  432. device_matrixColIndicesAndValues = nullptr;
  433. }
  434. if (device_x != nullptr) {
  435. cudaError_t cudaFreeResult = cudaFree(device_x);
  436. if (cudaFreeResult != cudaSuccess) {
  437. std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
  438. }
  439. device_x = nullptr;
  440. }
  441. if (device_multiplyResult != nullptr) {
  442. cudaError_t cudaFreeResult = cudaFree(device_multiplyResult);
  443. if (cudaFreeResult != cudaSuccess) {
  444. std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl;
  445. }
  446. device_multiplyResult = nullptr;
  447. }
  448. #ifdef DEBUG
  449. std::cout << "(DLL) Finished cleanup." << std::endl;
  450. #endif
  451. }
  452. template <typename ValueType>
  453. void basicValueIteration_addVectorsInplace(std::vector<ValueType>& a, std::vector<ValueType> const& b) {
  454. ValueType* device_a = nullptr;
  455. ValueType* device_b = nullptr;
  456. const size_t vectorSize = std::max(a.size(), b.size());
  457. cudaError_t cudaMallocResult;
  458. CUDA_CHECK_ALL_ERRORS();
  459. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_a), sizeof(ValueType) * vectorSize);
  460. if (cudaMallocResult != cudaSuccess) {
  461. std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl;
  462. goto cleanup;
  463. }
  464. CUDA_CHECK_ALL_ERRORS();
  465. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * vectorSize);
  466. if (cudaMallocResult != cudaSuccess) {
  467. std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
  468. goto cleanup;
  469. }
  470. // Memory allocated, copy data to device
  471. cudaError_t cudaCopyResult;
  472. CUDA_CHECK_ALL_ERRORS();
  473. cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
  474. if (cudaCopyResult != cudaSuccess) {
  475. std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl;
  476. goto cleanup;
  477. }
  478. CUDA_CHECK_ALL_ERRORS();
  479. cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
  480. if (cudaCopyResult != cudaSuccess) {
  481. std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
  482. goto cleanup;
  483. }
  484. do {
  485. // Transform: Add multiplyResult + b inplace to multiplyResult
  486. thrust::device_ptr<ValueType> devicePtrThrust_a(device_a);
  487. thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
  488. thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus<ValueType>());
  489. CUDA_CHECK_ALL_ERRORS();
  490. } while (false);
  491. // Get result back from the device
  492. cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost);
  493. if (cudaCopyResult != cudaSuccess) {
  494. std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
  495. goto cleanup;
  496. }
  497. // All code related to freeing memory and clearing up the device
  498. cleanup:
  499. if (device_a != nullptr) {
  500. cudaError_t cudaFreeResult = cudaFree(device_a);
  501. if (cudaFreeResult != cudaSuccess) {
  502. std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl;
  503. }
  504. device_a = nullptr;
  505. }
  506. if (device_b != nullptr) {
  507. cudaError_t cudaFreeResult = cudaFree(device_b);
  508. if (cudaFreeResult != cudaSuccess) {
  509. std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl;
  510. }
  511. device_b = nullptr;
  512. }
  513. }
  514. template <typename IndexType, typename ValueType, bool Minimize>
  515. void basicValueIteration_reduceGroupedVector(std::vector<ValueType> const& groupedVector, std::vector<IndexType> const& grouping, std::vector<ValueType>& targetVector) {
  516. ValueType* device_groupedVector = nullptr;
  517. IndexType* device_grouping = nullptr;
  518. ValueType* device_target = nullptr;
  519. const size_t groupedSize = groupedVector.size();
  520. const size_t groupingSize = grouping.size();
  521. const size_t targetSize = targetVector.size();
  522. cudaError_t cudaMallocResult;
  523. CUDA_CHECK_ALL_ERRORS();
  524. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_groupedVector), sizeof(ValueType) * groupedSize);
  525. if (cudaMallocResult != cudaSuccess) {
  526. std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl;
  527. goto cleanup;
  528. }
  529. CUDA_CHECK_ALL_ERRORS();
  530. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_grouping), sizeof(IndexType) * groupingSize);
  531. if (cudaMallocResult != cudaSuccess) {
  532. std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl;
  533. goto cleanup;
  534. }
  535. CUDA_CHECK_ALL_ERRORS();
  536. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_target), sizeof(ValueType) * targetSize);
  537. if (cudaMallocResult != cudaSuccess) {
  538. std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl;
  539. goto cleanup;
  540. }
  541. // Memory allocated, copy data to device
  542. cudaError_t cudaCopyResult;
  543. CUDA_CHECK_ALL_ERRORS();
  544. cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice);
  545. if (cudaCopyResult != cudaSuccess) {
  546. std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl;
  547. goto cleanup;
  548. }
  549. CUDA_CHECK_ALL_ERRORS();
  550. cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice);
  551. if (cudaCopyResult != cudaSuccess) {
  552. std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl;
  553. goto cleanup;
  554. }
  555. do {
  556. // Reduce: Reduce multiplyResult to a new x vector
  557. cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector);
  558. CUDA_CHECK_ALL_ERRORS();
  559. } while (false);
  560. // Get result back from the device
  561. cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost);
  562. if (cudaCopyResult != cudaSuccess) {
  563. std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
  564. goto cleanup;
  565. }
  566. // All code related to freeing memory and clearing up the device
  567. cleanup:
  568. if (device_groupedVector != nullptr) {
  569. cudaError_t cudaFreeResult = cudaFree(device_groupedVector);
  570. if (cudaFreeResult != cudaSuccess) {
  571. std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl;
  572. }
  573. device_groupedVector = nullptr;
  574. }
  575. if (device_grouping != nullptr) {
  576. cudaError_t cudaFreeResult = cudaFree(device_grouping);
  577. if (cudaFreeResult != cudaSuccess) {
  578. std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." << std::endl;
  579. }
  580. device_grouping = nullptr;
  581. }
  582. if (device_target != nullptr) {
  583. cudaError_t cudaFreeResult = cudaFree(device_target);
  584. if (cudaFreeResult != cudaSuccess) {
  585. std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl;
  586. }
  587. device_target = nullptr;
  588. }
  589. }
  590. template <typename ValueType, bool Relative>
  591. void basicValueIteration_equalModuloPrecision(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType& maxElement) {
  592. ValueType* device_x = nullptr;
  593. ValueType* device_y = nullptr;
  594. const size_t vectorSize = x.size();
  595. cudaError_t cudaMallocResult;
  596. CUDA_CHECK_ALL_ERRORS();
  597. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * vectorSize);
  598. if (cudaMallocResult != cudaSuccess) {
  599. std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
  600. goto cleanup;
  601. }
  602. CUDA_CHECK_ALL_ERRORS();
  603. cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_y), sizeof(ValueType) * vectorSize);
  604. if (cudaMallocResult != cudaSuccess) {
  605. std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl;
  606. goto cleanup;
  607. }
  608. // Memory allocated, copy data to device
  609. cudaError_t cudaCopyResult;
  610. CUDA_CHECK_ALL_ERRORS();
  611. cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
  612. if (cudaCopyResult != cudaSuccess) {
  613. std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
  614. goto cleanup;
  615. }
  616. CUDA_CHECK_ALL_ERRORS();
  617. cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
  618. if (cudaCopyResult != cudaSuccess) {
  619. std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl;
  620. goto cleanup;
  621. }
  622. do {
  623. // Transform: x = abs(x - xSwap)/ xSwap
  624. thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
  625. thrust::device_ptr<ValueType> devicePtrThrust_y(device_y);
  626. thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
  627. CUDA_CHECK_ALL_ERRORS();
  628. // Reduce: get Max over x and check for res < Precision
  629. maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
  630. CUDA_CHECK_ALL_ERRORS();
  631. } while (false);
  632. // All code related to freeing memory and clearing up the device
  633. cleanup:
  634. if (device_x != nullptr) {
  635. cudaError_t cudaFreeResult = cudaFree(device_x);
  636. if (cudaFreeResult != cudaSuccess) {
  637. std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
  638. }
  639. device_x = nullptr;
  640. }
  641. if (device_y != nullptr) {
  642. cudaError_t cudaFreeResult = cudaFree(device_y);
  643. if (cudaFreeResult != cudaSuccess) {
  644. std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." << std::endl;
  645. }
  646. device_y = nullptr;
  647. }
  648. }
  649. /*
  650. * Declare and implement all exported functions for these Kernels here
  651. *
  652. */
  653. void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) {
  654. basicValueIteration_spmv<uint_fast64_t, double>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
  655. }
  656. void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) {
  657. basicValueIteration_addVectorsInplace<double>(a, b);
  658. }
  659. void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
  660. basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(groupedVector, grouping, targetVector);
  661. }
  662. void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
  663. basicValueIteration_reduceGroupedVector<uint_fast64_t, double, false>(groupedVector, grouping, targetVector);
  664. }
  665. void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
  666. basicValueIteration_equalModuloPrecision<double, true>(x, y, maxElement);
  667. }
  668. void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
  669. basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement);
  670. }
  671. // Float
  672. void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b) {
  673. basicValueIteration_spmv<uint_fast64_t, float>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
  674. }
  675. void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b) {
  676. basicValueIteration_addVectorsInplace<float>(a, b);
  677. }
  678. void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
  679. basicValueIteration_reduceGroupedVector<uint_fast64_t, float, true>(groupedVector, grouping, targetVector);
  680. }
  681. void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
  682. basicValueIteration_reduceGroupedVector<uint_fast64_t, float, false>(groupedVector, grouping, targetVector);
  683. }
  684. void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
  685. basicValueIteration_equalModuloPrecision<float, true>(x, y, maxElement);
  686. }
  687. void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
  688. basicValueIteration_equalModuloPrecision<float, false>(x, y, maxElement);
  689. }
  690. bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
  691. if (relativePrecisionCheck) {
  692. return basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
  693. } else {
  694. return basicValueIteration_mvReduce<true, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
  695. }
  696. }
  697. bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
  698. if (relativePrecisionCheck) {
  699. return basicValueIteration_mvReduce<false, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
  700. } else {
  701. return basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
  702. }
  703. }
  704. bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
  705. if (relativePrecisionCheck) {
  706. return basicValueIteration_mvReduce<true, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
  707. } else {
  708. return basicValueIteration_mvReduce<true, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
  709. }
  710. }
  711. bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
  712. if (relativePrecisionCheck) {
  713. return basicValueIteration_mvReduce<false, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
  714. } else {
  715. return basicValueIteration_mvReduce<false, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
  716. }
  717. }
  718. size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
  719. size_t const valueTypeSize = sizeof(double);
  720. size_t const indexTypeSize = sizeof(uint_fast64_t);
  721. /*
  722. IndexType* device_matrixRowIndices = nullptr;
  723. IndexType* device_matrixColIndices = nullptr;
  724. ValueType* device_matrixValues = nullptr;
  725. ValueType* device_x = nullptr;
  726. ValueType* device_xSwap = nullptr;
  727. ValueType* device_b = nullptr;
  728. ValueType* device_multiplyResult = nullptr;
  729. IndexType* device_nondeterministicChoiceIndices = nullptr;
  730. */
  731. // Row Indices, Column Indices, Values, Choice Indices
  732. size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
  733. // Vectors x, xSwap, b, multiplyResult
  734. size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
  735. return (matrixDataSize + vectorSizes);
  736. }
  737. size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
  738. size_t const valueTypeSize = sizeof(float);
  739. size_t const indexTypeSize = sizeof(uint_fast64_t);
  740. /*
  741. IndexType* device_matrixRowIndices = nullptr;
  742. IndexType* device_matrixColIndices = nullptr;
  743. ValueType* device_matrixValues = nullptr;
  744. ValueType* device_x = nullptr;
  745. ValueType* device_xSwap = nullptr;
  746. ValueType* device_b = nullptr;
  747. ValueType* device_multiplyResult = nullptr;
  748. IndexType* device_nondeterministicChoiceIndices = nullptr;
  749. */
  750. // Row Indices, Column Indices, Values, Choice Indices
  751. size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
  752. // Vectors x, xSwap, b, multiplyResult
  753. size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
  754. return (matrixDataSize + vectorSizes);
  755. }