The source code and Dockerfile for the GSW2024 AI Lab.
/*
 * This is an extension of the original CUSP csr_vector.h SpMV implementation.
 * It is based on that code and incorporates changes needed to cope with the details
 * of the StoRM code.
 * As this is mostly copy & paste, the original license still applies.
 */
/*
 * Copyright 2008-2009 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <limits>
#include <cstdint>
#include <algorithm>

#include <math_functions.h>

#include <cusp/detail/device/spmv/csr_vector.h>

#include "storm-cudaplugin-config.h"

namespace cusp
{
namespace detail
{
namespace device
{
//////////////////////////////////////////////////////////////////////////////
// CSR SpMV kernels based on a vector model (one warp per row)
//////////////////////////////////////////////////////////////////////////////
//
// spmv_csr_vector_device
//   Each row of the CSR matrix is assigned to a warp. The warp computes
//   y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with
//   the x vector, in parallel. This division of work implies that
//   the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
//   manner (but generally not aligned). On GT200 these accesses are
//   coalesced, unlike kernels based on the one-row-per-thread division of
//   work. Since an entire 32-thread warp is assigned to each row, many
//   threads will remain idle when their row contains a small number
//   of elements. This code relies on implicit synchronization among
//   threads in a warp.
//
// spmv_csr_vector_tex_device
//   Same as spmv_csr_vector_device, except that the texture cache is
//   used for accessing the x vector.
//
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32]
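//
// Data layout note (added for clarity, not part of the original CUSP code): the kernels below
// do not take separate column-index and value arrays. Judging from the 3*jj / 4*jj indexing,
// matrixColumnIndicesAndValues appears to pack, per non-zero entry, a 64-bit column index
// followed by a 32-bit float value, i.e. three floats (12 bytes) per entry, or four floats
// (16 bytes) when STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT pads each entry to an 8-byte
// boundary. A hypothetical struct view of one entry (illustrative only) would be:
//
//     struct PackedEntry {
//         uint_fast64_t column; // read via reinterpret_cast from the float array
//         float value;          // followed by 4 bytes of padding if 64-bit alignment is enabled
//     };
//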
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR, 1)
__global__ void
storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y)
{
    __shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
    __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2];

    const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

    const uint_fast64_t thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;  // global thread index
    const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);        // thread index within the vector
    const uint_fast64_t vector_id   = thread_id / THREADS_PER_VECTOR;                // global vector index
    const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR;              // vector index within the block
    const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x;                 // total number of active vectors

    for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
    {
        // use two threads to fetch Ap[row] and Ap[row+1]
        // this is considerably faster than the straightforward version
        if(thread_lane < 2)
            ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane];

        const uint_fast64_t row_start = ptrs[vector_lane][0];  // same as: row_start = Ap[row];
        const uint_fast64_t row_end   = ptrs[vector_lane][1];  // same as: row_end   = Ap[row+1];

        // initialize local sum
        float sum = 0;

        if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
        {
            // ensure aligned memory access to Aj and Ax
            uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;

            // accumulate local sums
            if(jj >= row_start && jj < row_end) {
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
                sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x);
#else
                sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x);
#endif
                //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
            }

            // accumulate local sums
            for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) {
                //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
                sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x);
#else
                sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x);
#endif
            }
        } else {
            // accumulate local sums
            for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) {
                //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
                sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x);
#else
                sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x);
#endif
            }
        }

        // store local sum in shared memory
        sdata[threadIdx.x] = sum;

        // reduce local sums to row sum
        if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
        if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
        if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
        if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
        if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];

        // first thread writes the result
        if (thread_lane == 0)
            y[row] = sdata[threadIdx.x];
    }
}
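// Reference semantics (an illustrative sketch, not part of the original code): the kernel above
// computes y = A * x over the packed layout described earlier. A plain host-side equivalent,
// with a hypothetical ENTRY_STRIDE of 4 (64-bit float alignment) or 3 (default), would be:
//
//     void referenceSpmvFloat(uint_fast64_t numRows, uint_fast64_t const* rowIndices,
//                             float const* columnIndicesAndValues, float const* x, float* y) {
//         for (uint_fast64_t row = 0; row < numRows; ++row) {
//             float sum = 0.0f;
//             for (uint_fast64_t jj = rowIndices[row]; jj < rowIndices[row + 1]; ++jj) {
//                 uint_fast64_t const column = *reinterpret_cast<uint_fast64_t const*>(columnIndicesAndValues + ENTRY_STRIDE * jj);
//                 sum += columnIndicesAndValues[ENTRY_STRIDE * jj + 2] * x[column];
//             }
//             y[row] = sum;
//         }
//     }
//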
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW, 1)
__global__ void
storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer)
{
    __shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2];  // padded to avoid reduction conditionals
    __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2];

    const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW;

    const uint_fast64_t thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;  // global thread index
    const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1);           // thread index within the vector
    const uint_fast64_t vector_id   = thread_id / THREADS_PER_ROW;                   // global vector index
    const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW;                 // vector index within the block
    const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x;                    // total number of active vectors

    for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
    {
        // use two threads to fetch Ap[row] and Ap[row+1]
        // this is considerably faster than the straightforward version
        if(thread_lane < 2)
            ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane];

        const uint_fast64_t row_start = ptrs[vector_lane][0];  // same as: row_start = Ap[row];
        const uint_fast64_t row_end   = ptrs[vector_lane][1];  // same as: row_end   = Ap[row+1];

        // initialize local min/max
        float localMinMaxElement = minMaxInitializer;

        if (THREADS_PER_ROW == 32 && row_end - row_start > 32)
        {
            // ensure aligned memory access to Aj and Ax
            uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane;

            // accumulate local min/max
            if(jj >= row_start && jj < row_end) {
                if(Minimize) {
                    localMinMaxElement = min(localMinMaxElement, y[jj]);
                    //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
                } else {
                    localMinMaxElement = max(localMinMaxElement, y[jj]);
                    //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
                }
            }

            // accumulate local min/max
            for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW)
                if(Minimize) {
                    localMinMaxElement = min(localMinMaxElement, y[jj]);
                    //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
                } else {
                    localMinMaxElement = max(localMinMaxElement, y[jj]);
                    //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
                }
        }
        else
        {
            // accumulate local min/max
            for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW)
                if(Minimize) {
                    localMinMaxElement = min(localMinMaxElement, y[jj]);
                    //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
                } else {
                    localMinMaxElement = max(localMinMaxElement, y[jj]);
                    //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
                }
        }

        // store local min/max in shared memory
        sdata[threadIdx.x] = localMinMaxElement;

        // reduce local min/max to row min/max
        if (Minimize) {
            /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
            if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x +  8]) ? sdata[threadIdx.x +  8] : localMinMaxElement);
            if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x +  4]) ? sdata[threadIdx.x +  4] : localMinMaxElement);
            if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x +  2]) ? sdata[threadIdx.x +  2] : localMinMaxElement);
            if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x +  1]) ? sdata[threadIdx.x +  1] : localMinMaxElement);*/
            if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]);
            if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  8]);
            if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  4]);
            if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  2]);
            if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  1]);
        } else {
            /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
            if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x +  8]) ? sdata[threadIdx.x +  8] : localMinMaxElement);
            if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x +  4]) ? sdata[threadIdx.x +  4] : localMinMaxElement);
            if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x +  2]) ? sdata[threadIdx.x +  2] : localMinMaxElement);
            if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x +  1]) ? sdata[threadIdx.x +  1] : localMinMaxElement);*/
            if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]);
            if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x +  8]);
            if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x +  4]);
            if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x +  2]);
            if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x +  1]);
        }

        // first thread writes the result
        if (thread_lane == 0)
            x[row] = sdata[threadIdx.x];
    }
}
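// Summary (added for clarity, not part of the original code): for each row group delimited by
// nondeterministicChoiceIndices[row] and nondeterministicChoiceIndices[row + 1], the kernel
// above writes the minimum (Minimize == true) or maximum (Minimize == false) of the
// corresponding entries of y into x[row]; the structure mirrors the SpMV kernel, with the dot
// product replaced by a min/max reduction that starts from minMaxInitializer.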
template <bool Minimize, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y)
{
    float __minMaxInitializer = -std::numeric_limits<float>::max();
    if (Minimize) {
        __minMaxInitializer = std::numeric_limits<float>::max();
    }
    const float minMaxInitializer = __minMaxInitializer;

    const size_t THREADS_PER_BLOCK  = 128;
    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;

    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0);
    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));

    storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
        (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer);
}
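// Dispatch note (added for clarity, not part of the original code): the dispatchers below pick
// the number of threads assigned to each row or row group from the average number of entries
// per row, num_entries / num_rows, so that short rows do not leave most of a 32-thread warp
// idle; a full warp of 32 threads is only used when the average exceeds 16.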
template <bool Minimize>
void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y)
{
    const uint_fast64_t rows_per_group = num_entries / num_rows;

    if (rows_per_group <=  2) { __storm_cuda_opt_vector_reduce_float<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; }
    if (rows_per_group <=  4) { __storm_cuda_opt_vector_reduce_float<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; }
    if (rows_per_group <=  8) { __storm_cuda_opt_vector_reduce_float<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; }
    if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; }

    __storm_cuda_opt_vector_reduce_float<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y);
}
template <bool UseCache, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
{
    const size_t THREADS_PER_BLOCK  = 128;
    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;

    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));

    if (UseCache)
        bind_x(x);

    storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
        (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);

    if (UseCache)
        unbind_x(x);
}

void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
{
    const uint_fast64_t nnz_per_row = num_entries / num_rows;

    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }

    __storm_cuda_opt_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}

void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
{
    const uint_fast64_t nnz_per_row = num_entries / num_rows;

    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }

    __storm_cuda_opt_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
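// Usage example (an illustrative sketch under the assumption that all pointers already refer to
// device memory; the names numRows, numEntries, dRowIndices, dColumnIndicesAndValues, dX and dY
// are hypothetical):
//
//     cusp::detail::device::storm_cuda_opt_spmv_csr_vector_float(
//         numRows, numEntries, dRowIndices, dColumnIndicesAndValues, dX, dY);
//     cudaDeviceSynchronize(); // kernel launches are asynchronous; synchronize before reading dY
//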
// NON-OPT

template <bool UseCache, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
{
    const size_t THREADS_PER_BLOCK  = 128;
    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;

    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));

    if (UseCache)
        bind_x(x);

    spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
        (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);

    if (UseCache)
        unbind_x(x);
}

void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
{
    const uint_fast64_t nnz_per_row = num_entries / num_rows;

    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }

    __storm_cuda_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}

void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
{
    const uint_fast64_t nnz_per_row = num_entries / num_rows;

    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }

    __storm_cuda_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}

} // end namespace device
} // end namespace detail
} // end namespace cusp