
Renamed TopologicalMinMaxLinearEquationSolver -> TopologicalCudaMinMaxLinearEquationSolver

Branch: tempestpy_adaptions
Author: TimQu, 7 years ago
Commit: 4ab47671f5
8 changed files (changed lines in parentheses):
  src/storm/solver/MinMaxLinearEquationSolver.cpp (5)
  src/storm/solver/SolverSelectionOptions.cpp (2)
  src/storm/solver/SolverSelectionOptions.h (2)
  src/storm/solver/TopologicalCudaMinMaxLinearEquationSolver.cpp (485)
  src/storm/solver/TopologicalCudaMinMaxLinearEquationSolver.h (154)
  src/storm/solver/TopologicalMinMaxLinearEquationSolver.cpp (485)
  src/storm/solver/TopologicalMinMaxLinearEquationSolver.h (154)
  src/test/storm/solver/MinMaxLinearEquationSolverTest.cpp (12)

src/storm/solver/MinMaxLinearEquationSolver.cpp (5 changed lines)

@@ -5,6 +5,7 @@
#include "storm/solver/LinearEquationSolver.h"
#include "storm/solver/IterativeMinMaxLinearEquationSolver.h"
#include "storm/solver/TopologicalMinMaxLinearEquationSolver.h"
#include "storm/solver/TopologicalCudaMinMaxLinearEquationSolver.h"
#include "storm/solver/LpMinMaxLinearEquationSolver.h"
#include "storm/environment/solver/MinMaxSolverEnvironment.h"
@@ -198,8 +199,8 @@ namespace storm {
auto method = env.solver().minMax().getMethod();
if (method == MinMaxMethod::ValueIteration || method == MinMaxMethod::PolicyIteration || method == MinMaxMethod::RationalSearch || method == MinMaxMethod::QuickValueIteration) {
result = std::make_unique<IterativeMinMaxLinearEquationSolver<ValueType>>(std::make_unique<GeneralLinearEquationSolverFactory<ValueType>>());
} else if (method == MinMaxMethod::Topological) {
result = std::make_unique<TopologicalMinMaxLinearEquationSolver<ValueType>>();
} else if (method == MinMaxMethod::TopologicalCuda) {
result = std::make_unique<TopologicalCudaMinMaxLinearEquationSolver<ValueType>>();
} else if (method == MinMaxMethod::LinearProgramming) {
result = std::make_unique<LpMinMaxLinearEquationSolver<ValueType>>(std::make_unique<GeneralLinearEquationSolverFactory<ValueType>>(), std::make_unique<storm::utility::solver::LpSolverFactory<ValueType>>());
} else {
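With this dispatch in place, callers reach the renamed solver by selecting the new method on the environment. A minimal sketch (not part of this commit), assuming storm's usual Environment and factory APIs (setMethod, GeneralMinMaxLinearEquationSolverFactory, setMatrix as shown in the headers below):

#include <memory>
#include "storm/environment/Environment.h"
#include "storm/environment/solver/MinMaxSolverEnvironment.h"
#include "storm/solver/MinMaxLinearEquationSolver.h"
#include "storm/storage/SparseMatrix.h"

// Hypothetical helper: request the CUDA-backed topological method so that the
// branch added above instantiates TopologicalCudaMinMaxLinearEquationSolver.
std::unique_ptr<storm::solver::MinMaxLinearEquationSolver<double>> makeCudaTopologicalSolver(storm::storage::SparseMatrix<double> const& matrix) {
    storm::Environment env;
    env.solver().minMax().setMethod(storm::solver::MinMaxMethod::TopologicalCuda);
    storm::solver::GeneralMinMaxLinearEquationSolverFactory<double> factory;
    auto solver = factory.create(env);
    solver->setMatrix(matrix);
    return solver;
}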

src/storm/solver/SolverSelectionOptions.cpp (2 changed lines)

@@ -16,6 +16,8 @@ namespace storm {
return "ratsearch";
case MinMaxMethod::QuickValueIteration:
return "QuickValueIteration";
case MinMaxMethod::TopologicalCuda:
return "topologicalcuda";
}
return "invalid";
}
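A small usage sketch (assumption: toString(MinMaxMethod) is declared in SolverSelectionOptions.h, as this file suggests):

#include <iostream>
#include "storm/solver/SolverSelectionOptions.h"

void logSelectedMethod() {
    // Per the switch above, the new enum value renders as "topologicalcuda".
    std::cout << "min-max method: " << storm::solver::toString(storm::solver::MinMaxMethod::TopologicalCuda) << '\n';
}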

src/storm/solver/SolverSelectionOptions.h (2 changed lines)

@@ -6,7 +6,7 @@
namespace storm {
namespace solver {
ExtendEnumsWithSelectionField(MinMaxMethod, PolicyIteration, ValueIteration, LinearProgramming, Topological, RationalSearch, QuickValueIteration)
ExtendEnumsWithSelectionField(MinMaxMethod, PolicyIteration, ValueIteration, LinearProgramming, Topological, RationalSearch, QuickValueIteration, TopologicalCuda)
ExtendEnumsWithSelectionField(GameMethod, PolicyIteration, ValueIteration)
ExtendEnumsWithSelectionField(LraMethod, LinearProgramming, ValueIteration)
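For readers unfamiliar with the macro: ExtendEnumsWithSelectionField declares the enum together with storm-internal selection helpers, so the change amounts to appending one enumerator. A hand-expanded approximation (the enum part only; the generated helper code is omitted and assumed):

// Rough expansion of the MinMaxMethod declaration above; the real macro also
// generates selection-related boilerplate not shown here.
enum class MinMaxMethod {
    PolicyIteration, ValueIteration, LinearProgramming, Topological,
    RationalSearch, QuickValueIteration, TopologicalCuda  // newly added
};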

src/storm/solver/TopologicalCudaMinMaxLinearEquationSolver.cpp (485 changed lines)

@@ -0,0 +1,485 @@
#include "storm/solver/TopologicalCudaMinMaxLinearEquationSolver.h"
#include "storm/utility/vector.h"
#include "storm/utility/graph.h"
#include "storm/storage/StronglyConnectedComponentDecomposition.h"
#include "storm/exceptions/IllegalArgumentException.h"
#include "storm/exceptions/InvalidStateException.h"
#include "storm/exceptions/InvalidEnvironmentException.h"
#include "storm/environment/solver/MinMaxSolverEnvironment.h"
#include "storm/settings/SettingsManager.h"
#include "storm/settings/modules/CoreSettings.h"
#include "storm/utility/macros.h"
#include "storm-config.h"
#ifdef STORM_HAVE_CUDA
# include "cudaForStorm.h"
#endif
namespace storm {
namespace solver {
template<typename ValueType>
TopologicalCudaMinMaxLinearEquationSolver<ValueType>::TopologicalCudaMinMaxLinearEquationSolver() {
// Get the settings object to customize solving.
this->enableCuda = storm::settings::getModule<storm::settings::modules::CoreSettings>().isUseCudaSet();
#ifdef STORM_HAVE_CUDA
STORM_LOG_INFO_COND(this->enableCuda, "Option CUDA was not set, but the topological value iteration solver will use it anyway.");
#endif
}
template<typename ValueType>
TopologicalCudaMinMaxLinearEquationSolver<ValueType>::TopologicalCudaMinMaxLinearEquationSolver(storm::storage::SparseMatrix<ValueType> const& A) : TopologicalCudaMinMaxLinearEquationSolver() {
this->setMatrix(A);
}
template<typename ValueType>
void TopologicalCudaMinMaxLinearEquationSolver<ValueType>::setMatrix(storm::storage::SparseMatrix<ValueType> const& matrix) {
this->localA = nullptr;
this->A = &matrix;
}
template<typename ValueType>
void TopologicalCudaMinMaxLinearEquationSolver<ValueType>::setMatrix(storm::storage::SparseMatrix<ValueType>&& matrix) {
this->localA = std::make_unique<storm::storage::SparseMatrix<ValueType>>(std::move(matrix));
this->A = this->localA.get();
}
template<typename ValueType>
bool TopologicalCudaMinMaxLinearEquationSolver<ValueType>::internalSolveEquations(Environment const& env, OptimizationDirection dir, std::vector<ValueType>& x, std::vector<ValueType> const& b) const {
STORM_LOG_THROW(env.solver().minMax().getMethod() == MinMaxMethod::TopologicalCuda, storm::exceptions::InvalidEnvironmentException, "This min max solver does not support the selected technique.");
ValueType precision = storm::utility::convertNumber<ValueType>(env.solver().minMax().getPrecision());
uint64_t maxIters = env.solver().minMax().getMaximalNumberOfIterations();
bool relative = env.solver().minMax().getRelativeTerminationCriterion();
#ifdef GPU_USE_FLOAT
#define __FORCE_FLOAT_CALCULATION true
#else
#define __FORCE_FLOAT_CALCULATION false
#endif
if (__FORCE_FLOAT_CALCULATION && std::is_same<ValueType, double>::value) {
// FIXME: This conversion allocates quite a lot of additional storage; is it really necessary?
storm::storage::SparseMatrix<float> newA = this->A->template toValueType<float>();
TopologicalCudaMinMaxLinearEquationSolver<float> newSolver(newA);
std::vector<float> new_x = storm::utility::vector::toValueType<float>(x);
std::vector<float> const new_b = storm::utility::vector::toValueType<float>(b);
bool callConverged = newSolver.solveEquations(env, dir, new_x, new_b);
for (size_t i = 0, size = new_x.size(); i < size; ++i) {
x.at(i) = new_x.at(i);
}
return callConverged;
}
// For testing only
if (sizeof(ValueType) == sizeof(double)) {
//std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl;
STORM_LOG_INFO("<<< Using CUDA-DOUBLE Kernels >>>");
} else {
//std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl;
STORM_LOG_INFO("<<< Using CUDA-FLOAT Kernels >>>");
}
// Now, we need to determine the SCCs of the MDP and perform a topological sort.
std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = this->A->getRowGroupIndices();
// Check if the decomposition is necessary
#ifdef STORM_HAVE_CUDA
#define __USE_CUDAFORSTORM_OPT true
size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(A->getRowCount()), nondeterministicChoiceIndices.size(), static_cast<size_t>(A->getEntryCount()));
size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
#else
#define __USE_CUDAFORSTORM_OPT false
size_t const gpuSizeOfCompleteSystem = 0;
size_t const cudaFreeMemory = 0;
#endif
std::vector<std::pair<bool, storm::storage::StateBlock>> sccDecomposition;
if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) {
// Dummy output for SCC Times
//std::cout << "Computing the SCC Decomposition took 0ms" << std::endl;
#ifdef STORM_HAVE_CUDA
STORM_LOG_THROW(resetCudaDevice(), storm::exceptions::InvalidStateException, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
bool result = false;
size_t globalIterations = 0;
if (dir == OptimizationDirection::Minimize) {
result = __basicValueIteration_mvReduce_minimize<uint_fast64_t, ValueType>(maxIters, precision, relative, A->rowIndications, A->columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations);
} else {
result = __basicValueIteration_mvReduce_maximize<uint_fast64_t, ValueType>(maxIters, precision, relative, A->rowIndications, A->columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations);
}
STORM_LOG_INFO("Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
bool converged = false;
if (!result) {
converged = false;
STORM_LOG_ERROR("An error occurred in the CUDA Plugin. Can not continue.");
throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
} else {
converged = true;
}
// Check if the solver converged and issue a warning otherwise.
if (converged) {
STORM_LOG_INFO("Iterative solver converged after " << globalIterations << " iterations.");
} else {
STORM_LOG_WARN("Iterative solver did not converged after " << globalIterations << " iterations.");
}
#else
STORM_LOG_ERROR("The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!");
throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!";
#endif
} else {
storm::storage::BitVector fullSystem(this->A->getRowGroupCount(), true);
storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(*this->A, fullSystem, false, false);
STORM_LOG_THROW(sccDecomposition.size() > 0, storm::exceptions::IllegalArgumentException, "Can not solve given equation system as the SCC decomposition returned no SCCs.");
storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = sccDecomposition.extractPartitionDependencyGraph(*this->A);
std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
// Calculate the optimal distribution of sccs
std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, *this->A);
STORM_LOG_INFO("Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs.");
std::vector<ValueType>* currentX = nullptr;
std::vector<ValueType>* swap = nullptr;
size_t currentMaxLocalIterations = 0;
size_t localIterations = 0;
size_t globalIterations = 0;
bool converged = true;
// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
// solved after all SCCs it depends on have been solved.
for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
bool const useGpu = sccIndexIt->first;
storm::storage::StateBlock const& scc = sccIndexIt->second;
// Generate a sub matrix
storm::storage::BitVector subMatrixIndices(this->A->getColumnCount(), scc.cbegin(), scc.cend());
storm::storage::SparseMatrix<ValueType> sccSubmatrix = this->A->getSubmatrix(true, subMatrixIndices, subMatrixIndices);
std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount());
storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount());
std::vector<ValueType> sccSubXSwap(sccSubmatrix.getColumnCount());
std::vector<ValueType> sccMultiplyResult(sccSubmatrix.getRowCount());
// Prepare the pointers for swapping in the calculation
currentX = &sccSubX;
swap = &sccSubXSwap;
storm::utility::vector::selectVectorValues<ValueType>(sccSubX, subMatrixIndices, x); // x is getCols() large, whereas b and multiplyResult are getRows() large (nondet. choices times states)
std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1);
sccSubNondeterministicChoiceIndices.at(0) = 0;
// Pre-process all dependent states
// Remove outgoing transitions and create the ChoiceIndices
uint_fast64_t innerIndex = 0;
uint_fast64_t outerIndex = 0;
for (uint_fast64_t state : scc) {
// Choice Indices
sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]);
for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) {
typename storm::storage::SparseMatrix<ValueType>::const_rows row = this->A->getRow(rowGroupIt);
for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) {
if (!subMatrixIndices.get(rowIt->getColumn())) {
// This is an outgoing transition of a state in the SCC to a state not included in the SCC
// Subtracting Pr(tau) * x_other from b fixes that
sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn()));
}
}
++innerIndex;
}
++outerIndex;
}
// For the current SCC, we need to perform value iteration until convergence.
if (useGpu) {
#ifdef STORM_HAVE_CUDA
STORM_LOG_THROW(resetCudaDevice(), storm::exceptions::InvalidStateException, "Could not reset CUDA Device, can not use CUDA-based equation solver.");
//STORM_LOG_INFO("Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
//STORM_LOG_INFO("We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
//STORM_LOG_INFO("The CUDA Runtime Version is " << getRuntimeCudaVersion());
bool result = false;
localIterations = 0;
if (dir == OptimizationDirection::Minimize) {
result = __basicValueIteration_mvReduce_minimize<uint_fast64_t, ValueType>(maxIters, precision, relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
} else {
result = __basicValueIteration_mvReduce_maximize<uint_fast64_t, ValueType>(maxIters, precision, relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
}
STORM_LOG_INFO("Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
if (!result) {
converged = false;
STORM_LOG_ERROR("An error occurred in the CUDA Plugin. Can not continue.");
throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
} else {
converged = true;
}
// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
// track of the maximum.
if (localIterations > currentMaxLocalIterations) {
currentMaxLocalIterations = localIterations;
}
globalIterations += localIterations;
#else
STORM_LOG_ERROR("The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!");
throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!";
#endif
} else {
//std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl;
STORM_LOG_INFO("Performance Warning: Using CPU based TopoSolver! (double)");
localIterations = 0;
converged = false;
while (!converged && localIterations < maxIters) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectors<ValueType>(sccMultiplyResult, sccSubB, sccMultiplyResult);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
// Reduce the vector x' by applying min/max for all non-deterministic choices.
storm::utility::vector::reduceVectorMinOrMax<ValueType>(dir,sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, precision, relative);
// Update environment variables.
std::swap(currentX, swap);
++localIterations;
++globalIterations;
}
STORM_LOG_INFO("Executed " << localIterations << " of max. " << maxIters << " Iterations.");
}
// The Result of this SCC has to be taken back into the main result vector
innerIndex = 0;
for (uint_fast64_t state : scc) {
x.at(state) = currentX->at(innerIndex);
++innerIndex;
}
// The swap pointers refer to SCC-local temporaries, so invalidate them here.
currentX = nullptr;
swap = nullptr;
// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
// track of the maximum.
if (localIterations > currentMaxLocalIterations) {
currentMaxLocalIterations = localIterations;
}
}
//std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << localIterations << " iterations in a single block." << std::endl;
// Check if the solver converged and issue a warning otherwise.
if (converged) {
STORM_LOG_INFO("Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
} else {
STORM_LOG_WARN("Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
}
return converged;
}
}
template<typename ValueType>
std::vector<std::pair<bool, storm::storage::StateBlock>>
TopologicalCudaMinMaxLinearEquationSolver<ValueType>::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const {
(void)matrix;
std::vector<std::pair<bool, storm::storage::StateBlock>> result;
#ifdef STORM_HAVE_CUDA
// 95% to have a bit of padding
size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
size_t lastResultIndex = 0;
std::vector<uint_fast64_t> const& rowGroupIndices = matrix.getRowGroupIndices();
size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(matrix.getRowCount()), rowGroupIndices.size(), static_cast<size_t>(matrix.getEntryCount()));
size_t const gpuSizePerRowGroup = std::max(static_cast<size_t>(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast<size_t>(1));
size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup;
size_t currentSize = 0;
size_t neededReserveSize = 0;
size_t startIndex = 0;
for (size_t i = 0; i < topologicalSort.size(); ++i) {
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]];
size_t const currentSccSize = scc.size();
uint_fast64_t rowCount = 0;
uint_fast64_t entryCount = 0;
for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
rowCount += matrix.getRowGroupSize(*sccIt);
entryCount += matrix.getRowGroupEntryCount(*sccIt);
}
size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(rowCount), scc.size(), static_cast<size_t>(entryCount));
if ((currentSize + sccSize) <= cudaFreeMemory) {
// There is enough space left in the current group
neededReserveSize += currentSccSize;
currentSize += sccSize;
} else {
// This would make the last open group too big for the GPU
if (startIndex < i) {
if ((startIndex + 1) < i) {
// More than one component
std::vector<uint_fast64_t> tempGroups;
tempGroups.reserve(neededReserveSize);
// Copy the first group to make inplace_merge possible
storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]];
tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend());
if (((startIndex + 1) + 80) >= i) {
size_t lastSize = 0;
for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) {
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
lastSize = tempGroups.size();
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
std::vector<uint_fast64_t>::iterator middleIterator = tempGroups.begin();
std::advance(middleIterator, lastSize);
std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end());
}
} else {
// Use std::sort
for (size_t j = startIndex + 1; j < i; ++j) {
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
}
std::sort(tempGroups.begin(), tempGroups.end());
}
result.push_back(std::make_pair(true, storm::storage::StateBlock(tempGroups.cbegin(), tempGroups.cend())));
} else {
// Only one group, copy construct.
result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
}
++lastResultIndex;
}
if (sccSize <= cudaFreeMemory) {
currentSize = sccSize;
neededReserveSize = currentSccSize;
startIndex = i;
} else {
// This group is too big to fit into the CUDA Memory by itself
result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]]))));
++lastResultIndex;
currentSize = 0;
neededReserveSize = 0;
startIndex = i + 1;
}
}
}
size_t const topologicalSortSize = topologicalSort.size();
if (startIndex < topologicalSortSize) {
if ((startIndex + 1) < topologicalSortSize) {
// More than one component
std::vector<uint_fast64_t> tempGroups;
tempGroups.reserve(neededReserveSize);
// Copy the first group to make inplace_merge possible.
storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]];
tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend());
// For set counts <= 80, in-place merge is faster.
if (((startIndex + 1) + 80) >= topologicalSortSize) {
size_t lastSize = 0;
for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) {
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
lastSize = tempGroups.size();
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
std::vector<uint_fast64_t>::iterator middleIterator = tempGroups.begin();
std::advance(middleIterator, lastSize);
std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end());
}
} else {
// Use std::sort
for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) {
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
}
std::sort(tempGroups.begin(), tempGroups.end());
}
result.push_back(std::make_pair(true, storm::storage::StateBlock(tempGroups.cbegin(), tempGroups.cend())));
}
else {
// Only one group, copy construct.
result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
}
++lastResultIndex;
}
#else
for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
result.push_back(std::make_pair(false, scc));
}
#endif
return result;
}
template<typename ValueType>
void TopologicalCudaMinMaxLinearEquationSolver<ValueType>::repeatedMultiply(Environment const& env, OptimizationDirection dir, std::vector<ValueType>& x, std::vector<ValueType> const* b, uint_fast64_t n) const {
std::unique_ptr<std::vector<ValueType>> multiplyResult = std::make_unique<std::vector<ValueType>>(this->A->getRowCount());
// Now perform matrix-vector multiplication as long as we meet the bound of the formula.
for (uint_fast64_t i = 0; i < n; ++i) {
this->A->multiplyWithVector(x, *multiplyResult);
// Add b if it is non-null.
if (b != nullptr) {
storm::utility::vector::addVectors(*multiplyResult, *b, *multiplyResult);
}
// Reduce the vector x' by applying min/max for all non-deterministic choices as given by the topmost
// element of the min/max operator stack.
storm::utility::vector::reduceVectorMinOrMax(dir, *multiplyResult, x, this->A->getRowGroupIndices());
}
}
template<typename ValueType>
TopologicalCudaMinMaxLinearEquationSolverFactory<ValueType>::TopologicalCudaMinMaxLinearEquationSolverFactory(bool trackScheduler) {
// Intentionally left empty.
}
template<typename ValueType>
std::unique_ptr<MinMaxLinearEquationSolver<ValueType>> TopologicalCudaMinMaxLinearEquationSolverFactory<ValueType>::create(Environment const& env) const {
STORM_LOG_THROW(env.solver().minMax().getMethod() == MinMaxMethod::TopologicalCuda, storm::exceptions::InvalidEnvironmentException, "This min max solver does not support the selected technique.");
return std::make_unique<TopologicalCudaMinMaxLinearEquationSolver<ValueType>>();
}
// Explicitly instantiate the solver.
template class TopologicalCudaMinMaxLinearEquationSolver<double>;
template class TopologicalCudaMinMaxLinearEquationSolverFactory<double>;
} // namespace solver
} // namespace storm
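End-to-end, the renamed solver can also be driven directly. A sketch (not part of this commit) that builds a two-state MDP with a custom row grouping and solves it; SparseMatrixBuilder and solveEquations follow storm's usual APIs, and the exact signatures are assumptions:

#include <vector>
#include "storm/environment/Environment.h"
#include "storm/environment/solver/MinMaxSolverEnvironment.h"
#include "storm/solver/TopologicalCudaMinMaxLinearEquationSolver.h"
#include "storm/storage/SparseMatrix.h"

int main() {
    // 3 rows, 2 columns, 4 entries; custom row grouping with 2 groups:
    // state 0 owns choice rows 0-1, state 1 owns choice row 2.
    storm::storage::SparseMatrixBuilder<double> builder(3, 2, 4, true, true, 2);
    builder.newRowGroup(0);
    builder.addNextValue(0, 0, 0.5);
    builder.addNextValue(0, 1, 0.5);
    builder.addNextValue(1, 1, 1.0);
    builder.newRowGroup(2);
    builder.addNextValue(2, 1, 1.0);
    storm::storage::SparseMatrix<double> matrix = builder.build();

    std::vector<double> x(2, 0.0);            // one value per state (column)
    std::vector<double> b = {0.0, 1.0, 0.0};  // one value per choice (row)

    storm::Environment env;
    env.solver().minMax().setMethod(storm::solver::MinMaxMethod::TopologicalCuda);

    storm::solver::TopologicalCudaMinMaxLinearEquationSolver<double> solver(matrix);
    bool converged = solver.solveEquations(env, storm::solver::OptimizationDirection::Maximize, x, b);
    return converged ? 0 : 1;  // x[0] should converge to 1.0 here
}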

src/storm/solver/TopologicalCudaMinMaxLinearEquationSolver.h (154 changed lines)

@@ -0,0 +1,154 @@
#ifndef STORM_SOLVER_TOPOLOGICALCUDAMINMAXLINEAREQUATIONSOLVER_H_
#define STORM_SOLVER_TOPOLOGICALCUDAMINMAXLINEAREQUATIONSOLVER_H_
#include "storm/solver/MinMaxLinearEquationSolver.h"
#include "storm/storage/StronglyConnectedComponentDecomposition.h"
#include "storm/storage/SparseMatrix.h"
#include "storm/exceptions/NotImplementedException.h"
#include "storm/exceptions/NotSupportedException.h"
#include <utility>
#include <vector>
#include "storm-config.h"
#ifdef STORM_HAVE_CUDA
#include "cudaForStorm.h"
#endif
namespace storm {
namespace solver {
/*!
* A class that uses SCC Decompositions to solve a min/max linear equation system.
*/
template<class ValueType>
class TopologicalCudaMinMaxLinearEquationSolver : public MinMaxLinearEquationSolver<ValueType> {
public:
TopologicalCudaMinMaxLinearEquationSolver();
/*!
* Constructs a min-max linear equation solver with parameters being set according to the settings
* object.
*
* @param A The matrix defining the coefficients of the linear equation system.
*/
TopologicalCudaMinMaxLinearEquationSolver(storm::storage::SparseMatrix<ValueType> const& A);
virtual void setMatrix(storm::storage::SparseMatrix<ValueType> const& matrix) override;
virtual void setMatrix(storm::storage::SparseMatrix<ValueType>&& matrix) override;
virtual bool internalSolveEquations(Environment const& env, OptimizationDirection dir, std::vector<ValueType>& x, std::vector<ValueType> const& b) const override;
virtual void repeatedMultiply(Environment const& env, OptimizationDirection dir, std::vector<ValueType>& x, std::vector<ValueType> const* b, uint_fast64_t n) const override;
private:
storm::storage::SparseMatrix<ValueType> const* A;
std::unique_ptr<storm::storage::SparseMatrix<ValueType>> localA;
bool enableCuda;
/*!
* Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory.
*/
std::vector<std::pair<bool, storm::storage::StateBlock>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const;
};
template <typename IndexType, typename ValueType>
bool __basicValueIteration_mvReduce_minimize(uint_fast64_t const, double const, bool const, std::vector<uint_fast64_t> const&, std::vector<storm::storage::MatrixEntry<IndexType, ValueType>> const&, std::vector<ValueType>& x, std::vector<ValueType> const&, std::vector<uint_fast64_t> const&, size_t&) {
//
STORM_LOG_THROW(false, storm::exceptions::NotImplementedException, "Unsupported template arguments.");
}
template <>
inline bool __basicValueIteration_mvReduce_minimize<uint_fast64_t, double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
(void)maxIterationCount;
(void)precision;
(void)relativePrecisionCheck;
(void)matrixRowIndices;
(void)columnIndicesAndValues;
(void)x;
(void)b;
(void)nondeterministicChoiceIndices;
(void)iterationCount;
#ifdef STORM_HAVE_CUDA
return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
#else
STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support.");
#endif
}
template <>
inline bool __basicValueIteration_mvReduce_minimize<uint_fast64_t, float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
(void)maxIterationCount;
(void)precision;
(void)relativePrecisionCheck;
(void)matrixRowIndices;
(void)columnIndicesAndValues;
(void)x;
(void)b;
(void)nondeterministicChoiceIndices;
(void)iterationCount;
#ifdef STORM_HAVE_CUDA
return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
#else
STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support.");
#endif
}
template <typename IndexType, typename ValueType>
bool __basicValueIteration_mvReduce_maximize(uint_fast64_t const, double const, bool const, std::vector<uint_fast64_t> const&, std::vector<storm::storage::MatrixEntry<IndexType, ValueType>> const&, std::vector<ValueType>&, std::vector<ValueType> const&, std::vector<uint_fast64_t> const&, size_t&) {
STORM_LOG_THROW(false, storm::exceptions::NotImplementedException, "Unsupported template arguments.");
}
template <>
inline bool __basicValueIteration_mvReduce_maximize<uint_fast64_t, double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
(void)maxIterationCount;
(void)precision;
(void)relativePrecisionCheck;
(void)matrixRowIndices;
(void)columnIndicesAndValues;
(void)x;
(void)b;
(void)nondeterministicChoiceIndices;
(void)iterationCount;
#ifdef STORM_HAVE_CUDA
return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
#else
STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support.");
#endif
}
template <>
inline bool __basicValueIteration_mvReduce_maximize<uint_fast64_t, float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
(void)maxIterationCount;
(void)precision;
(void)relativePrecisionCheck;
(void)matrixRowIndices;
(void)columnIndicesAndValues;
(void)x;
(void)b;
(void)nondeterministicChoiceIndices;
(void)iterationCount;
#ifdef STORM_HAVE_CUDA
return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
#else
STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support.");
#endif
}
template<typename ValueType>
class TopologicalCudaMinMaxLinearEquationSolverFactory : public MinMaxLinearEquationSolverFactory<ValueType> {
public:
TopologicalCudaMinMaxLinearEquationSolverFactory(bool trackScheduler = false);
protected:
virtual std::unique_ptr<MinMaxLinearEquationSolver<ValueType>> create(Environment const& env) const override;
};
} // namespace solver
} // namespace storm
#endif /* STORM_SOLVER_TOPOLOGICALCUDAMINMAXLINEAREQUATIONSOLVER_H_ */
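The __basicValueIteration_mvReduce_* templates above follow a "specialize or fail" dispatch: the generic template rejects unsupported index/value-type combinations, while explicit specializations forward to the concrete CUDA kernels. A self-contained illustration of the pattern (plain C++, not storm code):

#include <stdexcept>
#include <vector>

// Generic template: reached only for type combinations without a GPU kernel.
template <typename ValueType>
bool gpuValueIteration(std::vector<ValueType>&) {
    throw std::logic_error("Unsupported value type for the GPU kernels.");
}

// Explicit specialization: the supported case. In the header above, this is
// where a call like basicValueIteration_mvReduce_uint64_double_minimize goes.
template <>
inline bool gpuValueIteration<double>(std::vector<double>& x) {
    for (auto& value : x) { value = 0.0; }  // stand-in for the real kernel work
    return true;
}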

src/storm/solver/TopologicalMinMaxLinearEquationSolver.cpp (485 changed lines)

@@ -1,485 +0,0 @@
#include "storm/solver/TopologicalMinMaxLinearEquationSolver.h"
#include "storm/utility/vector.h"
#include "storm/utility/graph.h"
#include "storm/storage/StronglyConnectedComponentDecomposition.h"
#include "storm/exceptions/IllegalArgumentException.h"
#include "storm/exceptions/InvalidStateException.h"
#include "storm/exceptions/InvalidEnvironmentException.h"
#include "storm/environment/solver/MinMaxSolverEnvironment.h"
#include "storm/settings/SettingsManager.h"
#include "storm/settings/modules/CoreSettings.h"
#include "storm/utility/macros.h"
#include "storm-config.h"
#ifdef STORM_HAVE_CUDA
# include "cudaForStorm.h"
#endif
namespace storm {
namespace solver {
template<typename ValueType>
TopologicalMinMaxLinearEquationSolver<ValueType>::TopologicalMinMaxLinearEquationSolver() {
// Get the settings object to customize solving.
this->enableCuda = storm::settings::getModule<storm::settings::modules::CoreSettings>().isUseCudaSet();
#ifdef STORM_HAVE_CUDA
STORM_LOG_INFO_COND(this->enableCuda, "Option CUDA was not set, but the topological value iteration solver will use it anyway.");
#endif
}
template<typename ValueType>
TopologicalMinMaxLinearEquationSolver<ValueType>::TopologicalMinMaxLinearEquationSolver(storm::storage::SparseMatrix<ValueType> const& A) : TopologicalMinMaxLinearEquationSolver() {
this->setMatrix(A);
}
template<typename ValueType>
void TopologicalMinMaxLinearEquationSolver<ValueType>::setMatrix(storm::storage::SparseMatrix<ValueType> const& matrix) {
this->localA = nullptr;
this->A = &matrix;
}
template<typename ValueType>
void TopologicalMinMaxLinearEquationSolver<ValueType>::setMatrix(storm::storage::SparseMatrix<ValueType>&& matrix) {
this->localA = std::make_unique<storm::storage::SparseMatrix<ValueType>>(std::move(matrix));
this->A = this->localA.get();
}
template<typename ValueType>
bool TopologicalMinMaxLinearEquationSolver<ValueType>::internalSolveEquations(Environment const& env, OptimizationDirection dir, std::vector<ValueType>& x, std::vector<ValueType> const& b) const {
STORM_LOG_THROW(env.solver().minMax().getMethod() == MinMaxMethod::Topological, storm::exceptions::InvalidEnvironmentException, "This min max solver does not support the selected technique.");
ValueType precision = storm::utility::convertNumber<ValueType>(env.solver().minMax().getPrecision());
uint64_t maxIters = env.solver().minMax().getMaximalNumberOfIterations();
bool relative = env.solver().minMax().getRelativeTerminationCriterion();
#ifdef GPU_USE_FLOAT
#define __FORCE_FLOAT_CALCULATION true
#else
#define __FORCE_FLOAT_CALCULATION false
#endif
if (__FORCE_FLOAT_CALCULATION && std::is_same<ValueType, double>::value) {
// FIXME: This conversion allocates quite a lot of additional storage; is it really necessary?
storm::storage::SparseMatrix<float> newA = this->A->template toValueType<float>();
TopologicalMinMaxLinearEquationSolver<float> newSolver(newA);
std::vector<float> new_x = storm::utility::vector::toValueType<float>(x);
std::vector<float> const new_b = storm::utility::vector::toValueType<float>(b);
bool callConverged = newSolver.solveEquations(env, dir, new_x, new_b);
for (size_t i = 0, size = new_x.size(); i < size; ++i) {
x.at(i) = new_x.at(i);
}
return callConverged;
}
// For testing only
if (sizeof(ValueType) == sizeof(double)) {
//std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl;
STORM_LOG_INFO("<<< Using CUDA-DOUBLE Kernels >>>");
} else {
//std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl;
STORM_LOG_INFO("<<< Using CUDA-FLOAT Kernels >>>");
}
// Now, we need to determine the SCCs of the MDP and perform a topological sort.
std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = this->A->getRowGroupIndices();
// Check if the decomposition is necessary
#ifdef STORM_HAVE_CUDA
#define __USE_CUDAFORSTORM_OPT true
size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(A->getRowCount()), nondeterministicChoiceIndices.size(), static_cast<size_t>(A->getEntryCount()));
size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
#else
#define __USE_CUDAFORSTORM_OPT false
size_t const gpuSizeOfCompleteSystem = 0;
size_t const cudaFreeMemory = 0;
#endif
std::vector<std::pair<bool, storm::storage::StateBlock>> sccDecomposition;
if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) {
// Dummy output for SCC Times
//std::cout << "Computing the SCC Decomposition took 0ms" << std::endl;
#ifdef STORM_HAVE_CUDA
STORM_LOG_THROW(resetCudaDevice(), storm::exceptions::InvalidStateException, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
bool result = false;
size_t globalIterations = 0;
if (dir == OptimizationDirection::Minimize) {
result = __basicValueIteration_mvReduce_minimize<uint_fast64_t, ValueType>(maxIters, precision, relative, A->rowIndications, A->columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations);
} else {
result = __basicValueIteration_mvReduce_maximize<uint_fast64_t, ValueType>(maxIters, precision, relative, A->rowIndications, A->columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations);
}
STORM_LOG_INFO("Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
bool converged = false;
if (!result) {
converged = false;
STORM_LOG_ERROR("An error occurred in the CUDA Plugin. Can not continue.");
throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
} else {
converged = true;
}
// Check if the solver converged and issue a warning otherwise.
if (converged) {
STORM_LOG_INFO("Iterative solver converged after " << globalIterations << " iterations.");
} else {
STORM_LOG_WARN("Iterative solver did not converged after " << globalIterations << " iterations.");
}
#else
STORM_LOG_ERROR("The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!");
throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!";
#endif
} else {
storm::storage::BitVector fullSystem(this->A->getRowGroupCount(), true);
storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(*this->A, fullSystem, false, false);
STORM_LOG_THROW(sccDecomposition.size() > 0, storm::exceptions::IllegalArgumentException, "Can not solve given equation system as the SCC decomposition returned no SCCs.");
storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = sccDecomposition.extractPartitionDependencyGraph(*this->A);
std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
// Calculate the optimal distribution of sccs
std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, *this->A);
STORM_LOG_INFO("Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs.");
std::vector<ValueType>* currentX = nullptr;
std::vector<ValueType>* swap = nullptr;
size_t currentMaxLocalIterations = 0;
size_t localIterations = 0;
size_t globalIterations = 0;
bool converged = true;
// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
// solved after all SCCs it depends on have been solved.
for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
bool const useGpu = sccIndexIt->first;
storm::storage::StateBlock const& scc = sccIndexIt->second;
// Generate a sub matrix
storm::storage::BitVector subMatrixIndices(this->A->getColumnCount(), scc.cbegin(), scc.cend());
storm::storage::SparseMatrix<ValueType> sccSubmatrix = this->A->getSubmatrix(true, subMatrixIndices, subMatrixIndices);
std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount());
storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount());
std::vector<ValueType> sccSubXSwap(sccSubmatrix.getColumnCount());
std::vector<ValueType> sccMultiplyResult(sccSubmatrix.getRowCount());
// Prepare the pointers for swapping in the calculation
currentX = &sccSubX;
swap = &sccSubXSwap;
storm::utility::vector::selectVectorValues<ValueType>(sccSubX, subMatrixIndices, x); // x is getCols() large, whereas b and multiplyResult are getRows() large (nondet. choices times states)
std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1);
sccSubNondeterministicChoiceIndices.at(0) = 0;
// Pre-process all dependent states
// Remove outgoing transitions and create the ChoiceIndices
uint_fast64_t innerIndex = 0;
uint_fast64_t outerIndex = 0;
for (uint_fast64_t state : scc) {
// Choice Indices
sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]);
for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) {
typename storm::storage::SparseMatrix<ValueType>::const_rows row = this->A->getRow(rowGroupIt);
for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) {
if (!subMatrixIndices.get(rowIt->getColumn())) {
// This is an outgoing transition of a state in the SCC to a state not included in the SCC
// Subtracting Pr(tau) * x_other from b fixes that
sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn()));
}
}
++innerIndex;
}
++outerIndex;
}
// For the current SCC, we need to perform value iteration until convergence.
if (useGpu) {
#ifdef STORM_HAVE_CUDA
STORM_LOG_THROW(resetCudaDevice(), storm::exceptions::InvalidStateException, "Could not reset CUDA Device, can not use CUDA-based equation solver.");
//STORM_LOG_INFO("Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
//STORM_LOG_INFO("We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
//STORM_LOG_INFO("The CUDA Runtime Version is " << getRuntimeCudaVersion());
bool result = false;
localIterations = 0;
if (dir == OptimizationDirection::Minimize) {
result = __basicValueIteration_mvReduce_minimize<uint_fast64_t, ValueType>(maxIters, precision, relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
} else {
result = __basicValueIteration_mvReduce_maximize<uint_fast64_t, ValueType>(maxIters, precision, relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
}
STORM_LOG_INFO("Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
if (!result) {
converged = false;
STORM_LOG_ERROR("An error occurred in the CUDA Plugin. Can not continue.");
throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
} else {
converged = true;
}
// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
// track of the maximum.
if (localIterations > currentMaxLocalIterations) {
currentMaxLocalIterations = localIterations;
}
globalIterations += localIterations;
#else
STORM_LOG_ERROR("The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!");
throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!";
#endif
} else {
//std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl;
STORM_LOG_INFO("Performance Warning: Using CPU based TopoSolver! (double)");
localIterations = 0;
converged = false;
while (!converged && localIterations < maxIters) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectors<ValueType>(sccMultiplyResult, sccSubB, sccMultiplyResult);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
// Reduce the vector x' by applying min/max for all non-deterministic choices.
storm::utility::vector::reduceVectorMinOrMax<ValueType>(dir,sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, precision, relative);
// Update environment variables.
std::swap(currentX, swap);
++localIterations;
++globalIterations;
}
STORM_LOG_INFO("Executed " << localIterations << " of max. " << maxIters << " Iterations.");
}
// The Result of this SCC has to be taken back into the main result vector
innerIndex = 0;
for (uint_fast64_t state : scc) {
x.at(state) = currentX->at(innerIndex);
++innerIndex;
}
// The swap pointers refer to SCC-local temporaries, so invalidate them here.
currentX = nullptr;
swap = nullptr;
// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
// track of the maximum.
if (localIterations > currentMaxLocalIterations) {
currentMaxLocalIterations = localIterations;
}
}
//std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << localIterations << " iterations in a single block." << std::endl;
// Check if the solver converged and issue a warning otherwise.
if (converged) {
STORM_LOG_INFO("Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
} else {
STORM_LOG_WARN("Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
}
return converged;
}
}
template<typename ValueType>
std::vector<std::pair<bool, storm::storage::StateBlock>>
TopologicalMinMaxLinearEquationSolver<ValueType>::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const {
(void)matrix;
std::vector<std::pair<bool, storm::storage::StateBlock>> result;
#ifdef STORM_HAVE_CUDA
// 95% to have a bit of padding
size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
size_t lastResultIndex = 0;
std::vector<uint_fast64_t> const& rowGroupIndices = matrix.getRowGroupIndices();
size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(matrix.getRowCount()), rowGroupIndices.size(), static_cast<size_t>(matrix.getEntryCount()));
size_t const gpuSizePerRowGroup = std::max(static_cast<size_t>(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast<size_t>(1));
size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup;
size_t currentSize = 0;
size_t neededReserveSize = 0;
size_t startIndex = 0;
for (size_t i = 0; i < topologicalSort.size(); ++i) {
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]];
size_t const currentSccSize = scc.size();
uint_fast64_t rowCount = 0;
uint_fast64_t entryCount = 0;
for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
rowCount += matrix.getRowGroupSize(*sccIt);
entryCount += matrix.getRowGroupEntryCount(*sccIt);
}
size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(rowCount), scc.size(), static_cast<size_t>(entryCount));
if ((currentSize + sccSize) <= cudaFreeMemory) {
// There is enough space left in the current group
neededReserveSize += currentSccSize;
currentSize += sccSize;
} else {
// This would make the last open group too big for the GPU
if (startIndex < i) {
if ((startIndex + 1) < i) {
// More than one component
std::vector<uint_fast64_t> tempGroups;
tempGroups.reserve(neededReserveSize);
// Copy the first group to make inplace_merge possible
storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]];
tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend());
if (((startIndex + 1) + 80) >= i) {
size_t lastSize = 0;
for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) {
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
lastSize = tempGroups.size();
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
std::vector<uint_fast64_t>::iterator middleIterator = tempGroups.begin();
std::advance(middleIterator, lastSize);
std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end());
}
} else {
// Use std::sort
for (size_t j = startIndex + 1; j < i; ++j) {
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
}
std::sort(tempGroups.begin(), tempGroups.end());
}
result.push_back(std::make_pair(true, storm::storage::StateBlock(tempGroups.cbegin(), tempGroups.cend())));
} else {
// Only one group, copy construct.
result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
}
++lastResultIndex;
}
if (sccSize <= cudaFreeMemory) {
currentSize = sccSize;
neededReserveSize = currentSccSize;
startIndex = i;
} else {
// This group is too big to fit into the CUDA Memory by itself
result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]]))));
++lastResultIndex;
currentSize = 0;
neededReserveSize = 0;
startIndex = i + 1;
}
}
}
size_t const topologicalSortSize = topologicalSort.size();
if (startIndex < topologicalSortSize) {
if ((startIndex + 1) < topologicalSortSize) {
// More than one component
std::vector<uint_fast64_t> tempGroups;
tempGroups.reserve(neededReserveSize);
// Copy the first group to make inplace_merge possible.
storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]];
tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend());
// For set counts <= 80, in-place merge is faster.
if (((startIndex + 1) + 80) >= topologicalSortSize) {
size_t lastSize = 0;
for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) {
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
lastSize = tempGroups.size();
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
std::vector<uint_fast64_t>::iterator middleIterator = tempGroups.begin();
std::advance(middleIterator, lastSize);
std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end());
}
} else {
// Use std::sort
for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) {
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
}
std::sort(tempGroups.begin(), tempGroups.end());
}
result.push_back(std::make_pair(true, storm::storage::StateBlock(tempGroups.cbegin(), tempGroups.cend())));
}
else {
// Only one group, copy construct.
result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
}
++lastResultIndex;
}
#else
for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
result.push_back(std::make_pair(false, scc));
}
#endif
return result;
}
template<typename ValueType>
void TopologicalMinMaxLinearEquationSolver<ValueType>::repeatedMultiply(Environment const& env, OptimizationDirection dir, std::vector<ValueType>& x, std::vector<ValueType> const* b, uint_fast64_t n) const {
std::unique_ptr<std::vector<ValueType>> multiplyResult = std::make_unique<std::vector<ValueType>>(this->A->getRowCount());
// Now perform matrix-vector multiplication as long as we meet the bound of the formula.
for (uint_fast64_t i = 0; i < n; ++i) {
this->A->multiplyWithVector(x, *multiplyResult);
// Add b if it is non-null.
if (b != nullptr) {
storm::utility::vector::addVectors(*multiplyResult, *b, *multiplyResult);
}
// Reduce the vector x' by applying min/max for all non-deterministic choices as given by the topmost
// element of the min/max operator stack.
storm::utility::vector::reduceVectorMinOrMax(dir, *multiplyResult, x, this->A->getRowGroupIndices());
}
}
template<typename ValueType>
TopologicalMinMaxLinearEquationSolverFactory<ValueType>::TopologicalMinMaxLinearEquationSolverFactory(bool trackScheduler) {
// Intentionally left empty.
}
template<typename ValueType>
std::unique_ptr<MinMaxLinearEquationSolver<ValueType>> TopologicalMinMaxLinearEquationSolverFactory<ValueType>::create(Environment const& env) const {
STORM_LOG_THROW(env.solver().minMax().getMethod() == MinMaxMethod::Topological, storm::exceptions::InvalidEnvironmentException, "This min max solver does not support the selected technique.");
return std::make_unique<TopologicalMinMaxLinearEquationSolver<ValueType>>();
}
// Explicitly instantiate the solver.
template class TopologicalMinMaxLinearEquationSolver<double>;
template class TopologicalMinMaxLinearEquationSolverFactory<double>;
} // namespace solver
} // namespace storm

src/storm/solver/TopologicalMinMaxLinearEquationSolver.h (154 changed lines)

@@ -1,154 +0,0 @@
#ifndef STORM_SOLVER_TOPOLOGICALVALUEITERATIONMINMAXLINEAREQUATIONSOLVER_H_
#define STORM_SOLVER_TOPOLOGICALVALUEITERATIONMINMAXLINEAREQUATIONSOLVER_H_
#include "storm/solver/MinMaxLinearEquationSolver.h"
#include "storm/storage/StronglyConnectedComponentDecomposition.h"
#include "storm/storage/SparseMatrix.h"
#include "storm/exceptions/NotImplementedException.h"
#include "storm/exceptions/NotSupportedException.h"
#include <utility>
#include <vector>
#include "storm-config.h"
#ifdef STORM_HAVE_CUDA
#include "cudaForStorm.h"
#endif
namespace storm {
namespace solver {
/*!
* A class that uses SCC Decompositions to solve a min/max linear equation system.
*/
template<class ValueType>
class TopologicalMinMaxLinearEquationSolver : public MinMaxLinearEquationSolver<ValueType> {
public:
TopologicalMinMaxLinearEquationSolver();
/*!
* Constructs a min-max linear equation solver with parameters being set according to the settings
* object.
*
* @param A The matrix defining the coefficients of the linear equation system.
*/
TopologicalMinMaxLinearEquationSolver(storm::storage::SparseMatrix<ValueType> const& A);
virtual void setMatrix(storm::storage::SparseMatrix<ValueType> const& matrix) override;
virtual void setMatrix(storm::storage::SparseMatrix<ValueType>&& matrix) override;
virtual bool internalSolveEquations(Environment const& env, OptimizationDirection dir, std::vector<ValueType>& x, std::vector<ValueType> const& b) const override;
virtual void repeatedMultiply(Environment const& env, OptimizationDirection dir, std::vector<ValueType>& x, std::vector<ValueType> const* b, uint_fast64_t n) const override;
private:
storm::storage::SparseMatrix<ValueType> const* A;
std::unique_ptr<storm::storage::SparseMatrix<ValueType>> localA;
bool enableCuda;
/*!
* Given a topological sort of an SCC decomposition, calculates the optimal grouping of SCCs with respect to the available GPU memory.
*/
std::vector<std::pair<bool, storm::storage::StateBlock>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const;
};
template <typename IndexType, typename ValueType>
bool __basicValueIteration_mvReduce_minimize(uint_fast64_t const, double const, bool const, std::vector<uint_fast64_t> const&, std::vector<storm::storage::MatrixEntry<IndexType, ValueType>> const&, std::vector<ValueType>&, std::vector<ValueType> const&, std::vector<uint_fast64_t> const&, size_t&) {
// Fallback for unsupported index/value type combinations; see the explicit specializations below.
STORM_LOG_THROW(false, storm::exceptions::NotImplementedException, "Unsupported template arguments.");
}
template <>
inline bool __basicValueIteration_mvReduce_minimize<uint_fast64_t, double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
(void)maxIterationCount;
(void)precision;
(void)relativePrecisionCheck;
(void)matrixRowIndices;
(void)columnIndicesAndValues;
(void)x;
(void)b;
(void)nondeterministicChoiceIndices;
(void)iterationCount;
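// The casts above silence unused-parameter warnings: when STORM_HAVE_CUDA is not defined,
// the parameters are never consumed, since the CUDA kernel call below is compiled out.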
#ifdef STORM_HAVE_CUDA
return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
#else
STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support.");
#endif
}
template <>
inline bool __basicValueIteration_mvReduce_minimize<uint_fast64_t, float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
(void)maxIterationCount;
(void)precision;
(void)relativePrecisionCheck;
(void)matrixRowIndices;
(void)columnIndicesAndValues;
(void)x;
(void)b;
(void)nondeterministicChoiceIndices;
(void)iterationCount;
#ifdef STORM_HAVE_CUDA
return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
#else
STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support.");
#endif
}
template <typename IndexType, typename ValueType>
bool __basicValueIteration_mvReduce_maximize(uint_fast64_t const, double const, bool const, std::vector<uint_fast64_t> const&, std::vector<storm::storage::MatrixEntry<IndexType, ValueType>> const&, std::vector<ValueType>&, std::vector<ValueType> const&, std::vector<uint_fast64_t> const&, size_t&) {
STORM_LOG_THROW(false, storm::exceptions::NotImplementedException, "Unsupported template arguments.");
}
template <>
inline bool __basicValueIteration_mvReduce_maximize<uint_fast64_t, double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
(void)maxIterationCount;
(void)precision;
(void)relativePrecisionCheck;
(void)matrixRowIndices;
(void)columnIndicesAndValues;
(void)x;
(void)b;
(void)nondeterministicChoiceIndices;
(void)iterationCount;
#ifdef STORM_HAVE_CUDA
return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
#else
STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support.");
#endif
}
template <>
inline bool __basicValueIteration_mvReduce_maximize<uint_fast64_t, float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
(void)maxIterationCount;
(void)precision;
(void)relativePrecisionCheck;
(void)matrixRowIndices;
(void)columnIndicesAndValues;
(void)x;
(void)b;
(void)nondeterministicChoiceIndices;
(void)iterationCount;
#ifdef STORM_HAVE_CUDA
return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
#else
STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support.");
#endif
}
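// Dispatch pattern used above: the four explicit specializations forward to the CUDA kernels
// declared in cudaForStorm.h (or throw NotSupportedException in non-CUDA builds), while the
// unspecialized templates reject all other index/value type combinations at runtime.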
template<typename ValueType>
class TopologicalMinMaxLinearEquationSolverFactory : public MinMaxLinearEquationSolverFactory<ValueType> {
public:
TopologicalMinMaxLinearEquationSolverFactory(bool trackScheduler = false);
protected:
virtual std::unique_ptr<MinMaxLinearEquationSolver<ValueType>> create(Environment const& env) const override;
};
} // namespace solver
} // namespace storm
#endif /* STORM_SOLVER_TOPOLOGICALVALUEITERATIONMINMAXLINEAREQUATIONSOLVER_H_ */

12
src/test/storm/solver/MinMaxLinearEquationSolverTest.cpp

@ -45,6 +45,17 @@ namespace {
return env;
}
};
class DoubleTopologicalCudaViEnvironment {
public:
typedef double ValueType;
static const bool isExact = false;
static storm::Environment createEnvironment() {
storm::Environment env;
env.solver().minMax().setMethod(storm::solver::MinMaxMethod::TopologicalCuda);
env.solver().minMax().setPrecision(storm::utility::convertNumber<storm::RationalNumber>(1e-8));
return env;
}
};
class DoublePIEnvironment {
public:
typedef double ValueType;
@ -96,6 +107,7 @@ namespace {
DoubleViEnvironment,
DoubleSoundViEnvironment,
DoubleTopologicalViEnvironment,
DoubleTopologicalCudaViEnvironment,
DoublePIEnvironment,
RationalPIEnvironment,
RationalRationalSearchEnvironment
