The source code and dockerfile for the GSW2024 AI Lab.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.

360 lines
21 KiB

4 weeks ago
  1. /*
  2. * This is an extension of the original CUSP csr_vector.h SPMV implementation.
  3. * It is based on the Code and incorporates changes as to cope with the details
  4. * of the StoRM code.
  5. * Changes have been made for 1) different input format, 2) the sum calculation and 3) the group-reduce algorithm
  6. */
  7. /*
  8. * Copyright 2008-2009 NVIDIA Corporation
  9. *
  10. * Licensed under the Apache License, Version 2.0 (the "License");
  11. * you may not use this file except in compliance with the License.
  12. * You may obtain a copy of the License at
  13. *
  14. * http://www.apache.org/licenses/LICENSE-2.0
  15. *
  16. * Unless required by applicable law or agreed to in writing, software
  17. * distributed under the License is distributed on an "AS IS" BASIS,
  18. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  19. * See the License for the specific language governing permissions and
  20. * limitations under the License.
  21. */
  22. #pragma once
  23. #include <limits>
  24. #include <cstdint>
  25. #include <algorithm>
  26. #include <math_functions.h>
  27. #include <cusp/detail/device/spmv/csr_vector.h>
  28. namespace cusp
  29. {
  30. namespace detail
  31. {
  32. namespace device
  33. {
  34. //////////////////////////////////////////////////////////////////////////////
  35. // CSR SpMV kernels based on a vector model (one warp per row)
  36. //////////////////////////////////////////////////////////////////////////////
  37. //
  38. // spmv_csr_vector_device
  39. // Each row of the CSR matrix is assigned to a warp. The warp computes
  40. // y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with
  41. // the x vector, in parallel. This division of work implies that
  42. // the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
  43. // manner (but generally not aligned). On GT200 these accesses are
  44. // coalesced, unlike kernels based on the one-row-per-thread division of
  45. // work. Since an entire 32-thread warp is assigned to each row, many
  46. // threads will remain idle when their row contains a small number
  47. // of elements. This code relies on implicit synchronization among
  48. // threads in a warp.
  49. //
  50. // spmv_csr_vector_tex_device
  51. // Same as spmv_csr_vector_tex_device, except that the texture cache is
  52. // used for accessing the x vector.
  53. //
  54. // Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32]
  55. template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
  56. __launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
  57. __global__ void
  58. storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y)
  59. {
  60. __shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals
  61. __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2];
  62. const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
  63. const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
  64. const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector
  65. const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index
  66. const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block
  67. const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors
  68. for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
  69. {
  70. // use two threads to fetch Ap[row] and Ap[row+1]
  71. // this is considerably faster than the straightforward version
  72. if(thread_lane < 2)
  73. ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane];
  74. const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
  75. const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
  76. // initialize local sum
  77. double sum = 0;
  78. if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
  79. {
  80. // ensure aligned memory access to Aj and Ax
  81. uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
  82. // accumulate local sums
  83. if(jj >= row_start && jj < row_end) {
  84. sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x);
  85. //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
  86. }
  87. // accumulate local sums
  88. for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) {
  89. //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
  90. sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x);
  91. }
  92. } else {
  93. // accumulate local sums
  94. for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) {
  95. //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
  96. sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x);
  97. }
  98. }
  99. // store local sum in shared memory
  100. sdata[threadIdx.x] = sum;
  101. // reduce local sums to row sum
  102. if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
  103. if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8];
  104. if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4];
  105. if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2];
  106. if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1];
  107. // first thread writes the result
  108. if (thread_lane == 0)
  109. y[row] = sdata[threadIdx.x];
  110. }
  111. }
  112. template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
  113. __launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1)
  114. __global__ void
  115. storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer)
  116. {
  117. __shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals
  118. __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2];
  119. const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW;
  120. const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
  121. const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector
  122. const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index
  123. const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block
  124. const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors
  125. for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
  126. {
  127. // use two threads to fetch Ap[row] and Ap[row+1]
  128. // this is considerably faster than the straightforward version
  129. if(thread_lane < 2)
  130. ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane];
  131. const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
  132. const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
  133. // initialize local Min/Max
  134. double localMinMaxElement = minMaxInitializer;
  135. if (THREADS_PER_ROW == 32 && row_end - row_start > 32)
  136. {
  137. // ensure aligned memory access to Aj and Ax
  138. uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane;
  139. // accumulate local sums
  140. if(jj >= row_start && jj < row_end) {
  141. if(Minimize) {
  142. localMinMaxElement = min(localMinMaxElement, y[jj]);
  143. //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
  144. } else {
  145. localMinMaxElement = max(localMinMaxElement, y[jj]);
  146. //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
  147. }
  148. }
  149. // accumulate local sums
  150. for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW)
  151. if(Minimize) {
  152. localMinMaxElement = min(localMinMaxElement, y[jj]);
  153. //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
  154. } else {
  155. localMinMaxElement = max(localMinMaxElement, y[jj]);
  156. //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
  157. }
  158. }
  159. else
  160. {
  161. // accumulate local sums
  162. for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW)
  163. if(Minimize) {
  164. localMinMaxElement = min(localMinMaxElement, y[jj]);
  165. //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
  166. } else {
  167. localMinMaxElement = max(localMinMaxElement, y[jj]);
  168. //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
  169. }
  170. }
  171. // store local sum in shared memory
  172. sdata[threadIdx.x] = localMinMaxElement;
  173. // reduce local min/max to row min/max
  174. if (Minimize) {
  175. /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
  176. if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
  177. if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
  178. if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
  179. if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/
  180. if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]);
  181. if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]);
  182. if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]);
  183. if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]);
  184. if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]);
  185. } else {
  186. /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
  187. if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
  188. if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
  189. if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
  190. if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/
  191. if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]);
  192. if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]);
  193. if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]);
  194. if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]);
  195. if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]);
  196. }
  197. // first thread writes the result
  198. if (thread_lane == 0)
  199. x[row] = sdata[threadIdx.x];
  200. }
  201. }
  202. template <bool Minimize, unsigned int THREADS_PER_VECTOR>
  203. void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y)
  204. {
  205. double __minMaxInitializer = -std::numeric_limits<double>::max();
  206. if (Minimize) {
  207. __minMaxInitializer = std::numeric_limits<double>::max();
  208. }
  209. const double minMaxInitializer = __minMaxInitializer;
  210. const size_t THREADS_PER_BLOCK = 128;
  211. const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
  212. const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0);
  213. const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
  214. storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
  215. (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer);
  216. }
  217. template <bool Minimize>
  218. void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y)
  219. {
  220. const uint_fast64_t rows_per_group = num_entries / num_rows;
  221. if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; }
  222. if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; }
  223. if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; }
  224. if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; }
  225. __storm_cuda_opt_vector_reduce_double<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y);
  226. }
  227. template <bool UseCache, unsigned int THREADS_PER_VECTOR>
  228. void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y)
  229. {
  230. const size_t THREADS_PER_BLOCK = 128;
  231. const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
  232. const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
  233. const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
  234. if (UseCache)
  235. bind_x(x);
  236. storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
  237. (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
  238. if (UseCache)
  239. unbind_x(x);
  240. }
  241. void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y)
  242. {
  243. const uint_fast64_t nnz_per_row = num_entries / num_rows;
  244. if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
  245. if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
  246. if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
  247. if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
  248. __storm_cuda_opt_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
  249. }
  250. void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y)
  251. {
  252. const uint_fast64_t nnz_per_row = num_entries / num_rows;
  253. if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
  254. if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
  255. if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
  256. if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
  257. __storm_cuda_opt_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
  258. }
  259. // NON-OPT
  260. template <bool UseCache, unsigned int THREADS_PER_VECTOR>
  261. void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y)
  262. {
  263. const size_t THREADS_PER_BLOCK = 128;
  264. const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
  265. const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
  266. const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
  267. if (UseCache)
  268. bind_x(x);
  269. spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
  270. (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
  271. if (UseCache)
  272. unbind_x(x);
  273. }
  274. void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y)
  275. {
  276. const uint_fast64_t nnz_per_row = num_entries / num_rows;
  277. if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
  278. if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
  279. if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
  280. if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
  281. __storm_cuda_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
  282. }
  283. void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y)
  284. {
  285. const uint_fast64_t nnz_per_row = num_entries / num_rows;
  286. if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
  287. if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
  288. if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
  289. if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
  290. __storm_cuda_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
  291. }
  292. } // end namespace device
  293. } // end namespace detail
  294. } // end namespace cusp