Browse Source

Fixed a bug in the equalModuloPrecision function of the CUDA Kernel

Added more debug output to the CUDA handler functions
Added a function for grouping of SCCs for better performance
Added functionality and accessors to the SparseMatrix


Former-commit-id: 770aec1b09
tempestpy_adaptions
PBerger 11 years ago
parent
commit
05814f5d73
  1. 62
      resources/cudaForStorm/srcCuda/basicValueIteration.cu
  2. 1
      resources/cudaForStorm/srcCuda/basicValueIteration.h
  3. 259
      src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
  4. 12
      src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
  5. 9
      src/storage/SparseMatrix.cpp
  6. 7
      src/storage/SparseMatrix.h

62
resources/cudaForStorm/srcCuda/basicValueIteration.cu

@ -35,7 +35,7 @@ __host__ __device__ T operator()(const T &x, const T &y) const
{ {
if (Relative) { if (Relative) {
if (y == 0) { if (y == 0) {
return x;
return ((x >= 0) ? (x) : (-x));
} }
const T result = (x - y) / y; const T result = (x - y) / y;
return ((result >= 0) ? (result) : (-result)); return ((result >= 0) ? (result) : (-result));
@ -73,6 +73,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
#ifdef DEBUG #ifdef DEBUG
std::cout.sync_with_stdio(true); std::cout.sync_with_stdio(true);
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl; std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size(); size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
@ -143,6 +144,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
goto cleanup; goto cleanup;
} }
#ifdef DEBUG
std::cout << "(DLL) Finished allocating memory." << std::endl;
#endif
// Memory allocated, copy data to device // Memory allocated, copy data to device
cudaError_t cudaCopyResult; cudaError_t cudaCopyResult;
@ -204,6 +209,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
goto cleanup; goto cleanup;
} }
#ifdef DEBUG
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
#endif
// Data is on device, start Kernel // Data is on device, start Kernel
while (!converged && iterationCount < maxIterationCount) while (!converged && iterationCount < maxIterationCount)
{ // In a sub-area since transfer of control via label evades initialization { // In a sub-area since transfer of control via label evades initialization
@ -239,6 +248,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
std::swap(device_x, device_xSwap); std::swap(device_x, device_xSwap);
} }
#ifdef DEBUG #ifdef DEBUG
std::cout << "(DLL) Finished kernel execution." << std::endl;
std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl; std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
#endif #endif
@ -249,6 +259,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
goto cleanup; goto cleanup;
} }
#ifdef DEBUG
std::cout << "(DLL) Finished copying result data." << std::endl;
#endif
// All code related to freeing memory and clearing up the device // All code related to freeing memory and clearing up the device
cleanup: cleanup:
if (device_matrixRowIndices != nullptr) { if (device_matrixRowIndices != nullptr) {
@ -307,6 +321,9 @@ cleanup:
} }
device_nondeterministicChoiceIndices = nullptr; device_nondeterministicChoiceIndices = nullptr;
} }
#ifdef DEBUG
std::cout << "(DLL) Finished cleanup." << std::endl;
#endif
} }
template <typename IndexType, typename ValueType> template <typename IndexType, typename ValueType>
@ -323,6 +340,7 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
#ifdef DEBUG #ifdef DEBUG
std::cout.sync_with_stdio(true); std::cout.sync_with_stdio(true);
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl;
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl; std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size(); size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size();
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
@ -368,6 +386,10 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
goto cleanup; goto cleanup;
} }
#ifdef DEBUG
std::cout << "(DLL) Finished allocating memory." << std::endl;
#endif
// Memory allocated, copy data to device // Memory allocated, copy data to device
cudaError_t cudaCopyResult; cudaError_t cudaCopyResult;
@ -407,9 +429,17 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
goto cleanup; goto cleanup;
} }
#ifdef DEBUG
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
#endif
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult); cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS(); CUDA_CHECK_ALL_ERRORS();
#ifdef DEBUG
std::cout << "(DLL) Finished kernel execution." << std::endl;
#endif
// Get result back from the device // Get result back from the device
cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost); cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost);
if (cudaCopyResult != cudaSuccess) { if (cudaCopyResult != cudaSuccess) {
@ -417,6 +447,10 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
goto cleanup; goto cleanup;
} }
#ifdef DEBUG
std::cout << "(DLL) Finished copying result data." << std::endl;
#endif
// All code related to freeing memory and clearing up the device // All code related to freeing memory and clearing up the device
cleanup: cleanup:
if (device_matrixRowIndices != nullptr) { if (device_matrixRowIndices != nullptr) {
@ -454,6 +488,9 @@ cleanup:
} }
device_multiplyResult = nullptr; device_multiplyResult = nullptr;
} }
#ifdef DEBUG
std::cout << "(DLL) Finished cleanup." << std::endl;
#endif
} }
template <typename ValueType> template <typename ValueType>
@ -730,4 +767,27 @@ void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const max
} else { } else {
basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
} }
}
size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
size_t const valueTypeSize = sizeof(double);
size_t const indexTypeSize = sizeof(uint_fast64_t);
/*
IndexType* device_matrixRowIndices = nullptr;
IndexType* device_matrixColIndices = nullptr;
ValueType* device_matrixValues = nullptr;
ValueType* device_x = nullptr;
ValueType* device_xSwap = nullptr;
ValueType* device_b = nullptr;
ValueType* device_multiplyResult = nullptr;
IndexType* device_nondeterministicChoiceIndices = nullptr;
*/
// Row Indices, Column Indices, Values, Choice Indices
size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
// Vectors x, xSwap, b, multiplyResult
size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
return (matrixDataSize + vectorSizes);
} }

1
resources/cudaForStorm/srcCuda/basicValueIteration.h

@ -8,6 +8,7 @@
// Library exports // Library exports
#include "cudaForStorm_Export.h" #include "cudaForStorm_Export.h"
cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices); cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices); cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b); cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b);

259
src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp

@ -42,12 +42,12 @@ namespace storm {
} }
template<typename ValueType> template<typename ValueType>
void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
// Now, we need to determine the SCCs of the MDP and a topological sort. // Now, we need to determine the SCCs of the MDP and a topological sort.
//std::vector<std::vector<uint_fast64_t>> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph); //std::vector<std::vector<uint_fast64_t>> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph);
//storm::storage::SparseMatrix<T> stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents); //storm::storage::SparseMatrix<T> stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents);
std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> pseudoModel(A, nondeterministicChoiceIndices); storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> pseudoModel(A, nondeterministicChoiceIndices);
//storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(*static_cast<storm::models::AbstractPseudoModel<ValueType>*>(&pseudoModel), false, false); //storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(*static_cast<storm::models::AbstractPseudoModel<ValueType>*>(&pseudoModel), false, false);
storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(pseudoModel, false, false); storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(pseudoModel, false, false);
@ -60,6 +60,9 @@ namespace storm {
storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
// Calculate the optimal distribution of sccs
std::vector<std::pair<bool, std::vector<uint_fast64_t>>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A);
// Set up the environment for the power method. // Set up the environment for the power method.
// bool multiplyResultMemoryProvided = true; // bool multiplyResultMemoryProvided = true;
// if (multiplyResult == nullptr) { // if (multiplyResult == nullptr) {
@ -82,12 +85,13 @@ namespace storm {
// solved after all SCCs it depends on have been solved. // solved after all SCCs it depends on have been solved.
int counter = 0; int counter = 0;
for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) {
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
bool const useGpu = sccIndexIt->first;
std::vector <uint_fast64_t> const& scc = sccIndexIt->second;
// Generate a submatrix // Generate a submatrix
storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(subMatrixIndices, nondeterministicChoiceIndices);
storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount()); std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount());
storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount()); std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount());
@ -125,108 +129,115 @@ namespace storm {
} }
// For the current SCC, we need to perform value iteration until convergence. // For the current SCC, we need to perform value iteration until convergence.
if (useGpu) {
#ifdef STORM_HAVE_CUDAFORSTORM #ifdef STORM_HAVE_CUDAFORSTORM
if (!resetCudaDevice()) {
LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
}
LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
std::vector<ValueType> copyX(*currentX);
if (minimize) {
basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
}
else {
basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
}
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
if (!resetCudaDevice()) {
LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
}
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
// Reduce the vector x' by applying min/max for all non-deterministic choices.
std::vector<ValueType> copyX(*currentX);
if (minimize) { if (minimize) {
storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
} }
else { else {
storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
} }
converged = true;
// DEBUG
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
// Reduce the vector x' by applying min/max for all non-deterministic choices.
if (minimize) {
storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
}
else {
storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
}
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
// Update environment variables.
std::swap(currentX, swap);
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
++localIterations;
++globalIterations;
}
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
// Update environment variables.
std::swap(currentX, swap);
uint_fast64_t diffCount = 0;
for (size_t i = 0; i < currentX->size(); ++i) {
if (currentX->at(i) != copyX.at(i)) {
LOG4CPLUS_WARN(logger, "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i));
std::cout << "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i) << std::endl;
}
}
#else
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
// Reduce the vector x' by applying min/max for all non-deterministic choices.
if (minimize) {
storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
++localIterations;
++globalIterations;
} }
else {
storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
uint_fast64_t diffCount = 0;
for (size_t i = 0; i < currentX->size(); ++i) {
if (currentX->at(i) != copyX.at(i)) {
LOG4CPLUS_WARN(logger, "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i));
std::cout << "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i) << std::endl;
++diffCount;
}
} }
std::cout << "CUDA solution differed in " << diffCount << " of " << currentX->size() << " values." << std::endl;
#endif
} else {
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
// Reduce the vector x' by applying min/max for all non-deterministic choices.
if (minimize) {
storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
}
else {
storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
}
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
// Update environment variables.
std::swap(currentX, swap);
// Update environment variables.
std::swap(currentX, swap);
++localIterations;
++globalIterations;
++localIterations;
++globalIterations;
}
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
} }
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
#endif
// The Result of this SCC has to be taken back into the main result vector // The Result of this SCC has to be taken back into the main result vector
innerIndex = 0; innerIndex = 0;
@ -263,6 +274,72 @@ namespace storm {
} }
} }
template<typename ValueType>
std::vector<std::pair<bool, std::vector<uint_fast64_t>>>
TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const {
std::vector<std::pair<bool, std::vector<uint_fast64_t>>> result;
#ifdef STORM_HAVE_CUDAFORSTORM
// 95% to have a bit of padding
size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
size_t lastResultIndex = 0;
std::vector<uint_fast64_t> const& rowGroupIndices = matrix.getRowGroupIndices();
size_t currentSize = 0;
for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
uint_fast64_t rowCount = 0;
uint_fast64_t entryCount = 0;
std::vector<uint_fast64_t> rowGroups;
rowGroups.reserve(scc.size());
for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
rowCount += matrix.getRowGroupSize(*sccIt);
entryCount += matrix.getRowGroupEntryCount(*sccIt);
rowGroups.push_back(*sccIt);
}
size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(rowCount), scc.size(), static_cast<size_t>(entryCount));
if ((currentSize + sccSize) <= cudaFreeMemory) {
// There is enough space left in the current group
if (currentSize == 0) {
result.push_back(std::make_pair(true, rowGroups));
}
else {
result[lastResultIndex].second.insert(result[lastResultIndex].second.end(), rowGroups.begin(), rowGroups.end());
}
currentSize += sccSize;
}
else {
if (sccSize <= cudaFreeMemory) {
++lastResultIndex;
result.push_back(std::make_pair(true, rowGroups));
currentSize = sccSize;
}
else {
// This group is too big to fit into the CUDA Memory by itself
lastResultIndex += 2;
result.push_back(std::make_pair(false, rowGroups));
currentSize = 0;
}
}
}
#else
for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
std::vector<uint_fast64_t> rowGroups;
rowGroups.reserve(scc.size());
for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
rowGroups.push_back(*sccIt);
result.push_back(std::make_pair(false, rowGroups));
}
}
#endif
return result;
}
// Explicitly instantiate the solver. // Explicitly instantiate the solver.
template class TopologicalValueIterationNondeterministicLinearEquationSolver<double>; template class TopologicalValueIterationNondeterministicLinearEquationSolver<double>;
} // namespace solver } // namespace solver

12
src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h

@ -2,6 +2,11 @@
#define STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ #define STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_
#include "src/solver/NativeNondeterministicLinearEquationSolver.h" #include "src/solver/NativeNondeterministicLinearEquationSolver.h"
#include "src/storage/StronglyConnectedComponentDecomposition.h"
#include "src/storage/SparseMatrix.h"
#include <utility>
#include <vector>
namespace storm { namespace storm {
namespace solver { namespace solver {
@ -30,7 +35,12 @@ namespace storm {
virtual NondeterministicLinearEquationSolver<ValueType>* clone() const override; virtual NondeterministicLinearEquationSolver<ValueType>* clone() const override;
virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, std::vector<ValueType>* multiplyResult = nullptr, std::vector<ValueType>* newX = nullptr) const override;
virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult = nullptr, std::vector<ValueType>* newX = nullptr) const override;
private:
/*!
* Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory.
*/
std::vector<std::pair<bool, std::vector<uint_fast64_t>>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const;
}; };
} // namespace solver } // namespace solver
} // namespace storm } // namespace storm

9
src/storage/SparseMatrix.cpp

@ -330,6 +330,15 @@ namespace storm {
return entryCount; return entryCount;
} }
template<typename T>
uint_fast64_t SparseMatrix<T>::getRowGroupEntryCount(uint_fast64_t const group) const {
uint_fast64_t result = 0;
for (uint_fast64_t row = this->getRowGroupIndices()[group]; row < this->getRowGroupIndices()[group + 1]; ++row) {
result += (this->rowIndications[row + 1] - this->rowIndications[row]);
}
return result;
}
template<typename T> template<typename T>
uint_fast64_t SparseMatrix<T>::getRowGroupCount() const { uint_fast64_t SparseMatrix<T>::getRowGroupCount() const {
return rowGroupIndices.size() - 1; return rowGroupIndices.size() - 1;

7
src/storage/SparseMatrix.h

@ -344,6 +344,13 @@ namespace storm {
* @return The number of entries in the matrix. * @return The number of entries in the matrix.
*/ */
uint_fast64_t getEntryCount() const; uint_fast64_t getEntryCount() const;
/*!
* Returns the number of entries in the given row group of the matrix.
*
* @return The number of entries in the given row group of the matrix.
*/
uint_fast64_t getRowGroupEntryCount(uint_fast64_t const group) const;
/*! /*!
* Returns the number of row groups in the matrix. * Returns the number of row groups in the matrix.

Loading…
Cancel
Save