
Refactored the CUDA Kernel to once again use the "hacked" combination of column indices and values with a bit of reinterpret_cast magic.
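Roughly, the layout this relies on can be illustrated with the following host-side sketch (illustrative only, not code from this commit). It assumes sizeof(IndexType) == sizeof(ValueType), e.g. uint64_t and double, so that pair j of columnIndicesAndValues occupies flat slots 2*j (column index) and 2*j + 1 (value), which is what the 2*jj indexing in cuspExtension.h exploits.

    // Illustrative sketch of the "hacked" combined layout; not part of the commit.
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    using IndexType = uint64_t;
    using ValueType = double;

    int main() {
        // One host vector of (column, value) pairs instead of two separate vectors.
        std::vector<std::pair<IndexType, ValueType>> columnIndicesAndValues = {
            {0, 0.5}, {3, 0.25}, {7, 0.25}};

        // The trick only works if index and value have the same size and the pair has no padding.
        static_assert(sizeof(IndexType) == sizeof(ValueType), "layout hack needs equal sizes");
        assert(sizeof(std::pair<IndexType, ValueType>) == 2 * sizeof(IndexType));

        // A single copy of this raw buffer replaces the two separate index/value copies;
        // the kernel then reads interleaved slots (formally type punning, i.e. the
        // "reinterpret_cast magic" the commit message mentions).
        IndexType const* raw = reinterpret_cast<IndexType const*>(columnIndicesAndValues.data());
        for (std::size_t jj = 0; jj < columnIndicesAndValues.size(); ++jj) {
            IndexType column = raw[2 * jj];
            ValueType value = reinterpret_cast<ValueType const*>(raw)[2 * jj + 1];
            (void)column;
            (void)value;
        }
        return 0;
    }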

Refactored the CUDA-SCC grouping algorithm as it took 80x longer to calculate the groups than it took to calculate the entire solution.
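The new grouping walks the topologically sorted SCCs once, keeps accumulating consecutive SCCs while their estimated GPU memory footprint still fits into the free CUDA memory, and merges the states of a finished group with a single sort instead of inserting every state of every SCC into an ordered set. A simplified stand-alone sketch of that batching idea follows (hypothetical names; plain std::vector stands in for storm::storage::StateBlock and for the real per-SCC memory estimate):

    // Simplified sketch of the batching idea; not the committed code.
    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    using StateGroup = std::vector<uint64_t>;  // stand-in for storm::storage::StateBlock

    // Greedily packs consecutive SCCs (already topologically sorted) into groups that fit
    // the memory budget; each flushed group is merged and sorted exactly once.
    std::vector<std::pair<bool, StateGroup>> groupSccs(std::vector<StateGroup> const& sortedSccs,
                                                       std::vector<size_t> const& sccMemorySize,
                                                       size_t memoryBudget) {
        std::vector<std::pair<bool, StateGroup>> result;
        size_t currentSize = 0;
        size_t startIndex = 0;

        auto flush = [&](size_t endIndex) {  // merge SCCs [startIndex, endIndex) into one group
            if (startIndex >= endIndex) return;
            StateGroup merged;
            for (size_t j = startIndex; j < endIndex; ++j)
                merged.insert(merged.end(), sortedSccs[j].begin(), sortedSccs[j].end());
            std::sort(merged.begin(), merged.end());  // one sort per group instead of per-state inserts
            result.emplace_back(true, std::move(merged));
        };

        for (size_t i = 0; i < sortedSccs.size(); ++i) {
            size_t sccSize = sccMemorySize[i];
            if (currentSize + sccSize <= memoryBudget) {
                currentSize += sccSize;               // SCC still fits into the open group
            } else {
                flush(i);                             // close the open group
                if (sccSize <= memoryBudget) {
                    currentSize = sccSize;            // start a new group with this SCC
                    startIndex = i;
                } else {
                    result.emplace_back(false, sortedSccs[i]);  // too big for GPU memory, fallback path
                    currentSize = 0;
                    startIndex = i + 1;
                }
            }
        }
        flush(sortedSccs.size());                     // flush the trailing open group
        return result;
    }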


Former-commit-id: 5a5ffabe38
tempestpy_adaptions
PBerger committed 11 years ago
parent commit 26500ff4a8
Changed files:
  1. resources/cudaForStorm/srcCuda/basicValueIteration.cu (94 changed lines)
  2. resources/cudaForStorm/srcCuda/cuspExtension.h (36 changed lines)
  3. src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp (85 changed lines)

resources/cudaForStorm/srcCuda/basicValueIteration.cu (94 changed lines)

@@ -58,13 +58,12 @@ void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVect
 template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
 void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) {
-    std::vector<IndexType> matrixColumnIndices;
-    std::vector<ValueType> matrixValues;
-    exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
+    //std::vector<IndexType> matrixColumnIndices;
+    //std::vector<ValueType> matrixValues;
+    //exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
     IndexType* device_matrixRowIndices = nullptr;
-    IndexType* device_matrixColIndices = nullptr;
-    ValueType* device_matrixValues = nullptr;
+    IndexType* device_matrixColIndicesAndValues = nullptr;
     ValueType* device_x = nullptr;
     ValueType* device_xSwap = nullptr;
     ValueType* device_b = nullptr;
@@ -96,16 +95,9 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
     }
     CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl;
-        goto cleanup;
-    }
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixValues), sizeof(ValueType) * matrixNnzCount);
+    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
     if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl;
+        std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
         goto cleanup;
     }
@@ -159,16 +151,9 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
     }
     CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl;
-        goto cleanup;
-    }
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
+    cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
     if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl;
+        std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
         goto cleanup;
     }
@@ -216,7 +201,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
     // Data is on device, start Kernel
     while (!converged && iterationCount < maxIterationCount)
     { // In a sub-area since transfer of control via label evades initialization
-        cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
+        cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
         CUDA_CHECK_ALL_ERRORS();
         thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
@@ -272,19 +257,12 @@ cleanup:
         }
         device_matrixRowIndices = nullptr;
     }
-    if (device_matrixColIndices != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices);
+    if (device_matrixColIndicesAndValues != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
         if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl;
+            std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
         }
-        device_matrixColIndices = nullptr;
-    }
-    if (device_matrixValues != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixValues);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl;
-        }
-        device_matrixValues = nullptr;
+        device_matrixColIndicesAndValues = nullptr;
     }
     if (device_x != nullptr) {
         cudaError_t cudaFreeResult = cudaFree(device_x);
@@ -328,13 +306,8 @@ cleanup:
 template <typename IndexType, typename ValueType>
 void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) {
-    std::vector<IndexType> matrixColumnIndices;
-    std::vector<ValueType> matrixValues;
-    exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
     IndexType* device_matrixRowIndices = nullptr;
-    IndexType* device_matrixColIndices = nullptr;
-    ValueType* device_matrixValues = nullptr;
+    IndexType* device_matrixColIndicesAndValues = nullptr;
     ValueType* device_x = nullptr;
     ValueType* device_multiplyResult = nullptr;
@@ -359,16 +332,9 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
     }
     CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl;
-        goto cleanup;
-    }
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixValues), sizeof(ValueType) * matrixNnzCount);
+    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
     if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl;
+        std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
         goto cleanup;
     }
@@ -401,16 +367,9 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
     }
     CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
+    cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
     if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl;
-        goto cleanup;
-    }
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl;
+        std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
         goto cleanup;
     }
@@ -433,7 +392,7 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
     std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
 #endif
-    cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
+    cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
     CUDA_CHECK_ALL_ERRORS();
 #ifdef DEBUG
@@ -460,19 +419,12 @@ cleanup:
         }
         device_matrixRowIndices = nullptr;
     }
-    if (device_matrixColIndices != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl;
-        }
-        device_matrixColIndices = nullptr;
-    }
-    if (device_matrixValues != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixValues);
+    if (device_matrixColIndicesAndValues != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
         if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl;
+            std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
         }
-        device_matrixValues = nullptr;
+        device_matrixColIndicesAndValues = nullptr;
     }
     if (device_x != nullptr) {
         cudaError_t cudaFreeResult = cudaFree(device_x);

resources/cudaForStorm/srcCuda/cuspExtension.h (36 changed lines)

@@ -61,7 +61,7 @@ namespace device
 template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
 __launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
 __global__ void
-storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType * x, ValueType * y)
+storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y)
 {
     __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals
     __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];
@@ -95,17 +95,17 @@ storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType
             // accumulate local sums
             if(jj >= row_start && jj < row_end)
-                sum += matrixValues[jj] * fetch_x<UseCache>(matrixColumnIndices[jj], x);
+                sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
             // accumulate local sums
             for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
-                sum += matrixValues[jj] * fetch_x<UseCache>(matrixColumnIndices[jj], x);
+                sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
         }
         else
         {
             // accumulate local sums
             for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
-                sum += matrixValues[jj] * fetch_x<UseCache>(matrixColumnIndices[jj], x);
+                sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
         }
         // store local sum in shared memory
@@ -244,7 +244,7 @@ void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_
 }
 template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
-void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
 {
     const size_t THREADS_PER_BLOCK = 128;
     const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
@@ -256,36 +256,36 @@ void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType
     bind_x(x);
     storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
-        (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+        (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
     if (UseCache)
         unbind_x(x);
 }
 template <typename IndexType, typename ValueType>
-void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
 {
     const IndexType nnz_per_row = num_entries / num_rows;
-    if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<false, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<false, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<false, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<false,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<false, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<false, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<false, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<false,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    __storm_cuda_opt_spmv_csr_vector<false,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+    __storm_cuda_opt_spmv_csr_vector<false,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
 }
 template <typename IndexType, typename ValueType>
-void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
 {
     const IndexType nnz_per_row = num_entries / num_rows;
-    if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<true, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<true, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<true, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<true,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<true, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<true, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<true, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<true,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    __storm_cuda_opt_spmv_csr_vector<true,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+    __storm_cuda_opt_spmv_csr_vector<true,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
 }
 // NON-OPT

src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp (85 changed lines)

@@ -1,7 +1,6 @@
 #include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h"
 #include <utility>
-#include <set>
 #include "src/settings/Settings.h"
 #include "src/utility/vector.h"
@@ -46,7 +45,6 @@ namespace storm {
     template<typename ValueType>
     void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
-
         // Now, we need to determine the SCCs of the MDP and perform a topological sort.
         std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
         storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> pseudoModel(A, nondeterministicChoiceIndices);
@@ -219,57 +217,94 @@ namespace storm {
     size_t lastResultIndex = 0;
     std::vector<uint_fast64_t> const& rowGroupIndices = matrix.getRowGroupIndices();
+    size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(matrix.getRowCount()), rowGroupIndices.size(), static_cast<size_t>(matrix.getEntryCount()));
+    size_t const gpuSizePerRowGroup = std::max(static_cast<size_t>(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast<size_t>(1));
+    size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup;
     size_t currentSize = 0;
-    for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
-        storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
+    size_t neededReserveSize = 0;
+    size_t startIndex = 0;
+    for (size_t i = 0; i < topologicalSort.size(); ++i) {
+        storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]];
+        size_t const currentSccSize = scc.size();
         uint_fast64_t rowCount = 0;
         uint_fast64_t entryCount = 0;
-        storm::storage::StateBlock rowGroups;
-        rowGroups.reserve(scc.size());
         for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
             rowCount += matrix.getRowGroupSize(*sccIt);
             entryCount += matrix.getRowGroupEntryCount(*sccIt);
-            rowGroups.insert(*sccIt);
         }
         size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(rowCount), scc.size(), static_cast<size_t>(entryCount));
         if ((currentSize + sccSize) <= cudaFreeMemory) {
             // There is enough space left in the current group
+            neededReserveSize += currentSccSize;
+            currentSize += sccSize;
+        } else {
+            // This would make the last open group to big for the GPU
-            if (currentSize == 0) {
-                result.push_back(std::make_pair(true, rowGroups));
+            if (startIndex < i) {
+                if ((startIndex + 1) < i) {
+                    // More than one component
+                    std::vector<uint_fast64_t> tempGroups;
+                    tempGroups.reserve(neededReserveSize);
+                    for (size_t j = startIndex; j < i; ++j) {
+                        storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
+                        tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
                     }
-            else {
-                result[lastResultIndex].second.insert(rowGroups.begin(), rowGroups.end());
+                    std::sort(tempGroups.begin(), tempGroups.end());
+                    result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend())));
+                } else {
+                    // Only one group, copy construct.
+                    result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
                 }
-            currentSize += sccSize;
+                ++lastResultIndex;
             }
-        else {
             if (sccSize <= cudaFreeMemory) {
-                ++lastResultIndex;
-                result.push_back(std::make_pair(true, rowGroups));
                 currentSize = sccSize;
-            }
-            else {
+                neededReserveSize = currentSccSize;
+                startIndex = i;
+            } else {
                 // This group is too big to fit into the CUDA Memory by itself
-                lastResultIndex += 2;
-                result.push_back(std::make_pair(false, rowGroups));
+                result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]]))));
+                ++lastResultIndex;
                 currentSize = 0;
+                neededReserveSize = 0;
+                startIndex = i + 1;
+            }
         }
     }
+    size_t const topologicalSortSize = topologicalSort.size();
+    if (startIndex < topologicalSortSize) {
+        if ((startIndex + 1) < topologicalSortSize) {
+            // More than one component
+            std::vector<uint_fast64_t> tempGroups;
+            tempGroups.reserve(neededReserveSize);
+            for (size_t j = startIndex; j < topologicalSortSize; ++j) {
+                storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
+                tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
+            }
+            std::sort(tempGroups.begin(), tempGroups.end());
+            result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend())));
+        }
+        else {
+            // Only one group, copy construct.
+            result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
+        }
+        ++lastResultIndex;
     }
 #else
     for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
         storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
-        storm::storage::StateBlock rowGroups;
-        rowGroups.reserve(scc.size());
-        for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
-            rowGroups.insert(*sccIt);
-        }
-        result.push_back(std::make_pair(false, rowGroups));
+        result.push_back(std::make_pair(false, scc));
     }
 #endif
     return result;
