diff --git a/src/storm/solver/MinMaxLinearEquationSolver.cpp b/src/storm/solver/MinMaxLinearEquationSolver.cpp index c250c17a0..4a5066079 100644 --- a/src/storm/solver/MinMaxLinearEquationSolver.cpp +++ b/src/storm/solver/MinMaxLinearEquationSolver.cpp @@ -5,6 +5,7 @@ #include "storm/solver/LinearEquationSolver.h" #include "storm/solver/IterativeMinMaxLinearEquationSolver.h" #include "storm/solver/TopologicalMinMaxLinearEquationSolver.h" +#include "storm/solver/TopologicalCudaMinMaxLinearEquationSolver.h" #include "storm/solver/LpMinMaxLinearEquationSolver.h" #include "storm/environment/solver/MinMaxSolverEnvironment.h" @@ -198,8 +199,8 @@ namespace storm { auto method = env.solver().minMax().getMethod(); if (method == MinMaxMethod::ValueIteration || method == MinMaxMethod::PolicyIteration || method == MinMaxMethod::RationalSearch || method == MinMaxMethod::QuickValueIteration) { result = std::make_unique>(std::make_unique>()); - } else if (method == MinMaxMethod::Topological) { - result = std::make_unique>(); + } else if (method == MinMaxMethod::TopologicalCuda) { + result = std::make_unique>(); } else if (method == MinMaxMethod::LinearProgramming) { result = std::make_unique>(std::make_unique>(), std::make_unique>()); } else { diff --git a/src/storm/solver/SolverSelectionOptions.cpp b/src/storm/solver/SolverSelectionOptions.cpp index 3461f2bc6..f2537f1a4 100644 --- a/src/storm/solver/SolverSelectionOptions.cpp +++ b/src/storm/solver/SolverSelectionOptions.cpp @@ -16,6 +16,8 @@ namespace storm { return "ratsearch"; case MinMaxMethod::QuickValueIteration: return "QuickValueIteration"; + case MinMaxMethod::TopologicalCuda: + return "topologicalcuda"; } return "invalid"; } diff --git a/src/storm/solver/SolverSelectionOptions.h b/src/storm/solver/SolverSelectionOptions.h index fc9750611..e2ffdb5dd 100644 --- a/src/storm/solver/SolverSelectionOptions.h +++ b/src/storm/solver/SolverSelectionOptions.h @@ -6,7 +6,7 @@ namespace storm { namespace solver { - ExtendEnumsWithSelectionField(MinMaxMethod, PolicyIteration, ValueIteration, LinearProgramming, Topological, RationalSearch, QuickValueIteration) + ExtendEnumsWithSelectionField(MinMaxMethod, PolicyIteration, ValueIteration, LinearProgramming, Topological, RationalSearch, QuickValueIteration, TopologicalCuda) ExtendEnumsWithSelectionField(GameMethod, PolicyIteration, ValueIteration) ExtendEnumsWithSelectionField(LraMethod, LinearProgramming, ValueIteration) diff --git a/src/storm/solver/TopologicalCudaMinMaxLinearEquationSolver.cpp b/src/storm/solver/TopologicalCudaMinMaxLinearEquationSolver.cpp new file mode 100644 index 000000000..083e2581f --- /dev/null +++ b/src/storm/solver/TopologicalCudaMinMaxLinearEquationSolver.cpp @@ -0,0 +1,485 @@ +#include "storm/solver/TopologicalCudaMinMaxLinearEquationSolver.h" + +#include "storm/utility/vector.h" +#include "storm/utility/graph.h" +#include "storm/storage/StronglyConnectedComponentDecomposition.h" +#include "storm/exceptions/IllegalArgumentException.h" +#include "storm/exceptions/InvalidStateException.h" +#include "storm/exceptions/InvalidEnvironmentException.h" + +#include "storm/environment/solver/MinMaxSolverEnvironment.h" + +#include "storm/settings/SettingsManager.h" +#include "storm/settings/modules/CoreSettings.h" + +#include "storm/utility/macros.h" +#include "storm-config.h" +#ifdef STORM_HAVE_CUDA +# include "cudaForStorm.h" +#endif + +namespace storm { + namespace solver { + + template + TopologicalCudaMinMaxLinearEquationSolver::TopologicalCudaMinMaxLinearEquationSolver() { + // Get the settings object to customize solving. + this->enableCuda = storm::settings::getModule().isUseCudaSet(); +#ifdef STORM_HAVE_CUDA + STORM_LOG_INFO_COND(this->enableCuda, "Option CUDA was not set, but the topological value iteration solver will use it anyways."); +#endif + } + + template + TopologicalCudaMinMaxLinearEquationSolver::TopologicalCudaMinMaxLinearEquationSolver(storm::storage::SparseMatrix const& A) : TopologicalCudaMinMaxLinearEquationSolver() { + this->setMatrix(A); + } + + template + void TopologicalCudaMinMaxLinearEquationSolver::setMatrix(storm::storage::SparseMatrix const& matrix) { + this->localA = nullptr; + this->A = &matrix; + } + + template + void TopologicalCudaMinMaxLinearEquationSolver::setMatrix(storm::storage::SparseMatrix&& matrix) { + this->localA = std::make_unique>(std::move(matrix)); + this->A = this->localA.get(); + } + + template + bool TopologicalCudaMinMaxLinearEquationSolver::internalSolveEquations(Environment const& env, OptimizationDirection dir, std::vector& x, std::vector const& b) const { + STORM_LOG_THROW(env.solver().minMax().getMethod() == MinMaxMethod::Topological, storm::exceptions::InvalidEnvironmentException, "This min max solver does not support the selected technique."); + + ValueType precision = storm::utility::convertNumber(env.solver().minMax().getPrecision()); + uint64_t maxIters = env.solver().minMax().getMaximalNumberOfIterations(); + bool relative = env.solver().minMax().getMaximalNumberOfIterations(); + +#ifdef GPU_USE_FLOAT +#define __FORCE_FLOAT_CALCULATION true +#else +#define __FORCE_FLOAT_CALCULATION false +#endif + if (__FORCE_FLOAT_CALCULATION && std::is_same::value) { + // FIXME: This actually allocates quite some storage, because of this conversion, is it really necessary? + storm::storage::SparseMatrix newA = this->A->template toValueType(); + + TopologicalCudaMinMaxLinearEquationSolver newSolver(newA); + + std::vector new_x = storm::utility::vector::toValueType(x); + std::vector const new_b = storm::utility::vector::toValueType(b); + + bool callConverged = newSolver.solveEquations(env, dir, new_x, new_b); + + for (size_t i = 0, size = new_x.size(); i < size; ++i) { + x.at(i) = new_x.at(i); + } + return callConverged; + } + + // For testing only + if (sizeof(ValueType) == sizeof(double)) { + //std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; + STORM_LOG_INFO("<<< Using CUDA-DOUBLE Kernels >>>"); + } else { + //std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; + STORM_LOG_INFO("<<< Using CUDA-FLOAT Kernels >>>"); + } + + // Now, we need to determine the SCCs of the MDP and perform a topological sort. + std::vector const& nondeterministicChoiceIndices = this->A->getRowGroupIndices(); + + // Check if the decomposition is necessary +#ifdef STORM_HAVE_CUDA +#define __USE_CUDAFORSTORM_OPT true + size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(A->getRowCount()), nondeterministicChoiceIndices.size(), static_cast(A->getEntryCount())); + size_t const cudaFreeMemory = static_cast(getFreeCudaMemory() * 0.95); +#else +#define __USE_CUDAFORSTORM_OPT false + size_t const gpuSizeOfCompleteSystem = 0; + size_t const cudaFreeMemory = 0; +#endif + std::vector> sccDecomposition; + if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) { + // Dummy output for SCC Times + //std::cout << "Computing the SCC Decomposition took 0ms" << std::endl; + +#ifdef STORM_HAVE_CUDA + STORM_LOG_THROW(resetCudaDevice(), storm::exceptions::InvalidStateException, "Could not reset CUDA Device, can not use CUDA Equation Solver."); + + bool result = false; + size_t globalIterations = 0; + if (dir == OptimizationDirection::Minimize) { + result = __basicValueIteration_mvReduce_minimize(maxIters, precision, relative, A->rowIndications, A->columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); + } else { + result = __basicValueIteration_mvReduce_maximize(maxIters, precision, relative, A->rowIndications, A->columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); + } + STORM_LOG_INFO("Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); + + bool converged = false; + if (!result) { + converged = false; + STORM_LOG_ERROR("An error occurred in the CUDA Plugin. Can not continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; + } else { + converged = true; + } + + // Check if the solver converged and issue a warning otherwise. + if (converged) { + STORM_LOG_INFO("Iterative solver converged after " << globalIterations << " iterations."); + } else { + STORM_LOG_WARN("Iterative solver did not converged after " << globalIterations << " iterations."); + } +#else + STORM_LOG_ERROR("The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!"; +#endif + } else { + storm::storage::BitVector fullSystem(this->A->getRowGroupCount(), true); + storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(*this->A, fullSystem, false, false); + + STORM_LOG_THROW(sccDecomposition.size() > 0, storm::exceptions::IllegalArgumentException, "Can not solve given equation system as the SCC decomposition returned no SCCs."); + + storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = sccDecomposition.extractPartitionDependencyGraph(*this->A); + std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); + + // Calculate the optimal distribution of sccs + std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, *this->A); + STORM_LOG_INFO("Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); + + std::vector* currentX = nullptr; + std::vector* swap = nullptr; + size_t currentMaxLocalIterations = 0; + size_t localIterations = 0; + size_t globalIterations = 0; + bool converged = true; + + // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only + // solved after all SCCs it depends on have been solved. + for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { + bool const useGpu = sccIndexIt->first; + storm::storage::StateBlock const& scc = sccIndexIt->second; + + // Generate a sub matrix + storm::storage::BitVector subMatrixIndices(this->A->getColumnCount(), scc.cbegin(), scc.cend()); + storm::storage::SparseMatrix sccSubmatrix = this->A->getSubmatrix(true, subMatrixIndices, subMatrixIndices); + std::vector sccSubB(sccSubmatrix.getRowCount()); + storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); + std::vector sccSubX(sccSubmatrix.getColumnCount()); + std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); + std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); + + // Prepare the pointers for swapping in the calculation + currentX = &sccSubX; + swap = &sccSubXSwap; + + storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states) + std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); + sccSubNondeterministicChoiceIndices.at(0) = 0; + + // Pre-process all dependent states + // Remove outgoing transitions and create the ChoiceIndices + uint_fast64_t innerIndex = 0; + uint_fast64_t outerIndex = 0; + for (uint_fast64_t state : scc) { + // Choice Indices + sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); + + for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { + typename storm::storage::SparseMatrix::const_rows row = this->A->getRow(rowGroupIt); + for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { + if (!subMatrixIndices.get(rowIt->getColumn())) { + // This is an outgoing transition of a state in the SCC to a state not included in the SCC + // Subtracting Pr(tau) * x_other from b fixes that + sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); + } + } + ++innerIndex; + } + ++outerIndex; + } + + // For the current SCC, we need to perform value iteration until convergence. + if (useGpu) { +#ifdef STORM_HAVE_CUDA + STORM_LOG_THROW(resetCudaDevice(), storm::exceptions::InvalidStateException, "Could not reset CUDA Device, can not use CUDA-based equation solver."); + + //STORM_LOG_INFO("Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); + //STORM_LOG_INFO("We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); + //STORM_LOG_INFO("The CUDA Runtime Version is " << getRuntimeCudaVersion()); + + bool result = false; + localIterations = 0; + if (dir == OptimizationDirection::Minimum) { + result = __basicValueIteration_mvReduce_minimize(maxIters, precision, relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } else { + result = __basicValueIteration_mvReduce_maximize(maxIters, precision, relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } + STORM_LOG_INFO("Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); + + if (!result) { + converged = false; + STORM_LOG_ERROR("An error occurred in the CUDA Plugin. Can not continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; + } else { + converged = true; + } + + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + globalIterations += localIterations; +#else + STORM_LOG_ERROR("The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!"; +#endif + } else { + //std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl; + STORM_LOG_INFO("Performance Warning: Using CPU based TopoSolver! (double)"); + localIterations = 0; + converged = false; + while (!converged && localIterations < maxIters) { + // Compute x' = A*x + b. + sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); + storm::utility::vector::addVectors(sccMultiplyResult, sccSubB, sccMultiplyResult); + + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); + //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + + /* + Versus: + A.multiplyWithVector(*currentX, *multiplyResult); + storm::utility::vector::addVectorsInPlace(*multiplyResult, b); + */ + + // Reduce the vector x' by applying min/max for all non-deterministic choices. + storm::utility::vector::reduceVectorMinOrMax(dir,sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + + // Determine whether the method converged. + // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher + // running time. In fact, it is faster. This has to be investigated. + // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, precision, relative); + + // Update environment variables. + std::swap(currentX, swap); + + ++localIterations; + ++globalIterations; + } + STORM_LOG_INFO("Executed " << localIterations << " of max. " << maxIters << " Iterations."); + } + + + // The Result of this SCC has to be taken back into the main result vector + innerIndex = 0; + for (uint_fast64_t state : scc) { + x.at(state) = currentX->at(innerIndex); + ++innerIndex; + } + + // Since the pointers for swapping in the calculation point to temps they should not be valid anymore + currentX = nullptr; + swap = nullptr; + + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + } + + //std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << localIterations << " iterations in a single block." << std::endl; + + // Check if the solver converged and issue a warning otherwise. + if (converged) { + STORM_LOG_INFO("Iterative solver converged after " << currentMaxLocalIterations << " iterations."); + } else { + STORM_LOG_WARN("Iterative solver did not converged after " << currentMaxLocalIterations << " iterations."); + } + + return converged; + } + } + + template + std::vector> + TopologicalCudaMinMaxLinearEquationSolver::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const { + + (void)matrix; + + std::vector> result; + +#ifdef STORM_HAVE_CUDA + // 95% to have a bit of padding + size_t const cudaFreeMemory = static_cast(getFreeCudaMemory() * 0.95); + size_t lastResultIndex = 0; + + std::vector const& rowGroupIndices = matrix.getRowGroupIndices(); + + size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(matrix.getRowCount()), rowGroupIndices.size(), static_cast(matrix.getEntryCount())); + size_t const gpuSizePerRowGroup = std::max(static_cast(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast(1)); + size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup; + + size_t currentSize = 0; + size_t neededReserveSize = 0; + size_t startIndex = 0; + for (size_t i = 0; i < topologicalSort.size(); ++i) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]]; + size_t const currentSccSize = scc.size(); + + uint_fast64_t rowCount = 0; + uint_fast64_t entryCount = 0; + + for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { + rowCount += matrix.getRowGroupSize(*sccIt); + entryCount += matrix.getRowGroupEntryCount(*sccIt); + } + + size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(rowCount), scc.size(), static_cast(entryCount)); + + if ((currentSize + sccSize) <= cudaFreeMemory) { + // There is enough space left in the current group + neededReserveSize += currentSccSize; + currentSize += sccSize; + } else { + // This would make the last open group to big for the GPU + + if (startIndex < i) { + if ((startIndex + 1) < i) { + // More than one component + std::vector tempGroups; + tempGroups.reserve(neededReserveSize); + + // Copy the first group to make inplace_merge possible + storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; + tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); + + if (((startIndex + 1) + 80) >= i) { + size_t lastSize = 0; + for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + lastSize = tempGroups.size(); + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + std::vector::iterator middleIterator = tempGroups.begin(); + std::advance(middleIterator, lastSize); + std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); + } + } else { + // Use std::sort + for (size_t j = startIndex + 1; j < i; ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + } + std::sort(tempGroups.begin(), tempGroups.end()); + } + result.push_back(std::make_pair(true, storm::storage::StateBlock(tempGroups.cbegin(), tempGroups.cend()))); + } else { + // Only one group, copy construct. + result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); + } + ++lastResultIndex; + } + + if (sccSize <= cudaFreeMemory) { + currentSize = sccSize; + neededReserveSize = currentSccSize; + startIndex = i; + } else { + // This group is too big to fit into the CUDA Memory by itself + result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]])))); + ++lastResultIndex; + + currentSize = 0; + neededReserveSize = 0; + startIndex = i + 1; + } + } + } + + size_t const topologicalSortSize = topologicalSort.size(); + if (startIndex < topologicalSortSize) { + if ((startIndex + 1) < topologicalSortSize) { + // More than one component + std::vector tempGroups; + tempGroups.reserve(neededReserveSize); + + // Copy the first group to make inplace_merge possible. + storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; + tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); + + // For set counts <= 80, in-place merge is faster. + if (((startIndex + 1) + 80) >= topologicalSortSize) { + size_t lastSize = 0; + for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + lastSize = tempGroups.size(); + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + std::vector::iterator middleIterator = tempGroups.begin(); + std::advance(middleIterator, lastSize); + std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); + } + } else { + // Use std::sort + for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + } + std::sort(tempGroups.begin(), tempGroups.end()); + } + result.push_back(std::make_pair(true, storm::storage::StateBlock(tempGroups.cbegin(), tempGroups.cend()))); + } + else { + // Only one group, copy construct. + result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); + } + ++lastResultIndex; + } +#else + for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { + storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; + result.push_back(std::make_pair(false, scc)); + } +#endif + return result; + } + + template + void TopologicalCudaMinMaxLinearEquationSolver::repeatedMultiply(Environment const& env, OptimizationDirection dir, std::vector& x, std::vector const* b, uint_fast64_t n) const { + std::unique_ptr> multiplyResult = std::make_unique>(this->A->getRowCount()); + + // Now perform matrix-vector multiplication as long as we meet the bound of the formula. + for (uint_fast64_t i = 0; i < n; ++i) { + this->A->multiplyWithVector(x, *multiplyResult); + + // Add b if it is non-null. + if (b != nullptr) { + storm::utility::vector::addVectors(*multiplyResult, *b, *multiplyResult); + } + + // Reduce the vector x' by applying min/max for all non-deterministic choices as given by the topmost + // element of the min/max operator stack. + storm::utility::vector::reduceVectorMinOrMax(dir, *multiplyResult, x, this->A->getRowGroupIndices()); + } + } + + template + TopologicalCudaMinMaxLinearEquationSolverFactory::TopologicalCudaMinMaxLinearEquationSolverFactory(bool trackScheduler) { + // Intentionally left empty. + } + + template + std::unique_ptr> TopologicalCudaMinMaxLinearEquationSolverFactory::create(Environment const& env) const { + STORM_LOG_THROW(env.solver().minMax().getMethod() == MinMaxMethod::Topological, storm::exceptions::InvalidEnvironmentException, "This min max solver does not support the selected technique."); + return std::make_unique>(); + } + + // Explicitly instantiate the solver. + template class TopologicalCudaMinMaxLinearEquationSolver; + + template class TopologicalCudaMinMaxLinearEquationSolverFactory; + } // namespace solver +} // namespace storm diff --git a/src/storm/solver/TopologicalCudaMinMaxLinearEquationSolver.h b/src/storm/solver/TopologicalCudaMinMaxLinearEquationSolver.h new file mode 100644 index 000000000..e302028a5 --- /dev/null +++ b/src/storm/solver/TopologicalCudaMinMaxLinearEquationSolver.h @@ -0,0 +1,154 @@ +#ifndef STORM_SOLVER_TOPOLOGICALCUDAMINMAXLINEAREQUATIONSOLVER_H_ +#define STORM_SOLVER_TOPOLOGICALCUDAMINMAXLINEAREQUATIONSOLVER_H_ + +#include "storm/solver/MinMaxLinearEquationSolver.h" +#include "storm/storage/StronglyConnectedComponentDecomposition.h" +#include "storm/storage/SparseMatrix.h" +#include "storm/exceptions/NotImplementedException.h" +#include "storm/exceptions/NotSupportedException.h" + +#include +#include + +#include "storm-config.h" +#ifdef STORM_HAVE_CUDA +#include "cudaForStorm.h" +#endif + +namespace storm { + namespace solver { + + /*! + * A class that uses SCC Decompositions to solve a min/max linear equation system. + */ + template + class TopologicalCudaMinMaxLinearEquationSolver : public MinMaxLinearEquationSolver { + public: + TopologicalCudaMinMaxLinearEquationSolver(); + + /*! + * Constructs a min-max linear equation solver with parameters being set according to the settings + * object. + * + * @param A The matrix defining the coefficients of the linear equation system. + */ + TopologicalCudaMinMaxLinearEquationSolver(storm::storage::SparseMatrix const& A); + + virtual void setMatrix(storm::storage::SparseMatrix const& matrix) override; + virtual void setMatrix(storm::storage::SparseMatrix&& matrix) override; + + virtual bool internalSolveEquations(Environment const& env, OptimizationDirection dir, std::vector& x, std::vector const& b) const override; + + virtual void repeatedMultiply(Environment const& env, OptimizationDirection dir, std::vector& x, std::vector const* b, uint_fast64_t n) const override; + + private: + storm::storage::SparseMatrix const* A; + std::unique_ptr> localA; + + bool enableCuda; + /*! + * Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory. + */ + std::vector> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const; + }; + + template + bool __basicValueIteration_mvReduce_minimize(uint_fast64_t const, double const, bool const, std::vector const&, std::vector> const&, std::vector& x, std::vector const&, std::vector const&, size_t&) { + // + STORM_LOG_THROW(false, storm::exceptions::NotImplementedException, "Unsupported template arguments."); + } + template <> + inline bool __basicValueIteration_mvReduce_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + + (void)maxIterationCount; + (void)precision; + (void)relativePrecisionCheck; + (void)matrixRowIndices; + (void)columnIndicesAndValues; + (void)x; + (void)b; + (void)nondeterministicChoiceIndices; + (void)iterationCount; + +#ifdef STORM_HAVE_CUDA + return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support."); +#endif + } + template <> + inline bool __basicValueIteration_mvReduce_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + + (void)maxIterationCount; + (void)precision; + (void)relativePrecisionCheck; + (void)matrixRowIndices; + (void)columnIndicesAndValues; + (void)x; + (void)b; + (void)nondeterministicChoiceIndices; + (void)iterationCount; + +#ifdef STORM_HAVE_CUDA + return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support."); +#endif + } + + template + bool __basicValueIteration_mvReduce_maximize(uint_fast64_t const, double const, bool const, std::vector const&, std::vector> const&, std::vector&, std::vector const&, std::vector const&, size_t&) { + STORM_LOG_THROW(false, storm::exceptions::NotImplementedException, "Unsupported template arguments."); + } + template <> + inline bool __basicValueIteration_mvReduce_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + + (void)maxIterationCount; + (void)precision; + (void)relativePrecisionCheck; + (void)matrixRowIndices; + (void)columnIndicesAndValues; + (void)x; + (void)b; + (void)nondeterministicChoiceIndices; + (void)iterationCount; + +#ifdef STORM_HAVE_CUDA + return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support."); +#endif + } + template <> + inline bool __basicValueIteration_mvReduce_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + + (void)maxIterationCount; + (void)precision; + (void)relativePrecisionCheck; + (void)matrixRowIndices; + (void)columnIndicesAndValues; + (void)x; + (void)b; + (void)nondeterministicChoiceIndices; + (void)iterationCount; + +#ifdef STORM_HAVE_CUDA + return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support."); +#endif + } + + template + class TopologicalCudaMinMaxLinearEquationSolverFactory : public MinMaxLinearEquationSolverFactory { + public: + TopologicalCudaMinMaxLinearEquationSolverFactory(bool trackScheduler = false); + + protected: + virtual std::unique_ptr> create(Environment const& env) const override; + }; + + } // namespace solver +} // namespace storm + +#endif /* STORM_SOLVER_TOPOLOGICALVALUEITERATIONMINMAXLINEAREQUATIONSOLVER_H_ */ diff --git a/src/storm/solver/TopologicalMinMaxLinearEquationSolver.cpp b/src/storm/solver/TopologicalMinMaxLinearEquationSolver.cpp index 9e33a4c9e..e69de29bb 100644 --- a/src/storm/solver/TopologicalMinMaxLinearEquationSolver.cpp +++ b/src/storm/solver/TopologicalMinMaxLinearEquationSolver.cpp @@ -1,485 +0,0 @@ -#include "storm/solver/TopologicalMinMaxLinearEquationSolver.h" - -#include "storm/utility/vector.h" -#include "storm/utility/graph.h" -#include "storm/storage/StronglyConnectedComponentDecomposition.h" -#include "storm/exceptions/IllegalArgumentException.h" -#include "storm/exceptions/InvalidStateException.h" -#include "storm/exceptions/InvalidEnvironmentException.h" - -#include "storm/environment/solver/MinMaxSolverEnvironment.h" - -#include "storm/settings/SettingsManager.h" -#include "storm/settings/modules/CoreSettings.h" - -#include "storm/utility/macros.h" -#include "storm-config.h" -#ifdef STORM_HAVE_CUDA -# include "cudaForStorm.h" -#endif - -namespace storm { - namespace solver { - - template - TopologicalMinMaxLinearEquationSolver::TopologicalMinMaxLinearEquationSolver() { - // Get the settings object to customize solving. - this->enableCuda = storm::settings::getModule().isUseCudaSet(); -#ifdef STORM_HAVE_CUDA - STORM_LOG_INFO_COND(this->enableCuda, "Option CUDA was not set, but the topological value iteration solver will use it anyways."); -#endif - } - - template - TopologicalMinMaxLinearEquationSolver::TopologicalMinMaxLinearEquationSolver(storm::storage::SparseMatrix const& A) : TopologicalMinMaxLinearEquationSolver() { - this->setMatrix(A); - } - - template - void TopologicalMinMaxLinearEquationSolver::setMatrix(storm::storage::SparseMatrix const& matrix) { - this->localA = nullptr; - this->A = &matrix; - } - - template - void TopologicalMinMaxLinearEquationSolver::setMatrix(storm::storage::SparseMatrix&& matrix) { - this->localA = std::make_unique>(std::move(matrix)); - this->A = this->localA.get(); - } - - template - bool TopologicalMinMaxLinearEquationSolver::internalSolveEquations(Environment const& env, OptimizationDirection dir, std::vector& x, std::vector const& b) const { - STORM_LOG_THROW(env.solver().minMax().getMethod() == MinMaxMethod::Topological, storm::exceptions::InvalidEnvironmentException, "This min max solver does not support the selected technique."); - - ValueType precision = storm::utility::convertNumber(env.solver().minMax().getPrecision()); - uint64_t maxIters = env.solver().minMax().getMaximalNumberOfIterations(); - bool relative = env.solver().minMax().getMaximalNumberOfIterations(); - -#ifdef GPU_USE_FLOAT -#define __FORCE_FLOAT_CALCULATION true -#else -#define __FORCE_FLOAT_CALCULATION false -#endif - if (__FORCE_FLOAT_CALCULATION && std::is_same::value) { - // FIXME: This actually allocates quite some storage, because of this conversion, is it really necessary? - storm::storage::SparseMatrix newA = this->A->template toValueType(); - - TopologicalMinMaxLinearEquationSolver newSolver(newA); - - std::vector new_x = storm::utility::vector::toValueType(x); - std::vector const new_b = storm::utility::vector::toValueType(b); - - bool callConverged = newSolver.solveEquations(env, dir, new_x, new_b); - - for (size_t i = 0, size = new_x.size(); i < size; ++i) { - x.at(i) = new_x.at(i); - } - return callConverged; - } - - // For testing only - if (sizeof(ValueType) == sizeof(double)) { - //std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; - STORM_LOG_INFO("<<< Using CUDA-DOUBLE Kernels >>>"); - } else { - //std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; - STORM_LOG_INFO("<<< Using CUDA-FLOAT Kernels >>>"); - } - - // Now, we need to determine the SCCs of the MDP and perform a topological sort. - std::vector const& nondeterministicChoiceIndices = this->A->getRowGroupIndices(); - - // Check if the decomposition is necessary -#ifdef STORM_HAVE_CUDA -#define __USE_CUDAFORSTORM_OPT true - size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(A->getRowCount()), nondeterministicChoiceIndices.size(), static_cast(A->getEntryCount())); - size_t const cudaFreeMemory = static_cast(getFreeCudaMemory() * 0.95); -#else -#define __USE_CUDAFORSTORM_OPT false - size_t const gpuSizeOfCompleteSystem = 0; - size_t const cudaFreeMemory = 0; -#endif - std::vector> sccDecomposition; - if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) { - // Dummy output for SCC Times - //std::cout << "Computing the SCC Decomposition took 0ms" << std::endl; - -#ifdef STORM_HAVE_CUDA - STORM_LOG_THROW(resetCudaDevice(), storm::exceptions::InvalidStateException, "Could not reset CUDA Device, can not use CUDA Equation Solver."); - - bool result = false; - size_t globalIterations = 0; - if (dir == OptimizationDirection::Minimize) { - result = __basicValueIteration_mvReduce_minimize(maxIters, precision, relative, A->rowIndications, A->columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); - } else { - result = __basicValueIteration_mvReduce_maximize(maxIters, precision, relative, A->rowIndications, A->columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); - } - STORM_LOG_INFO("Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); - - bool converged = false; - if (!result) { - converged = false; - STORM_LOG_ERROR("An error occurred in the CUDA Plugin. Can not continue."); - throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; - } else { - converged = true; - } - - // Check if the solver converged and issue a warning otherwise. - if (converged) { - STORM_LOG_INFO("Iterative solver converged after " << globalIterations << " iterations."); - } else { - STORM_LOG_WARN("Iterative solver did not converged after " << globalIterations << " iterations."); - } -#else - STORM_LOG_ERROR("The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!"); - throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!"; -#endif - } else { - storm::storage::BitVector fullSystem(this->A->getRowGroupCount(), true); - storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(*this->A, fullSystem, false, false); - - STORM_LOG_THROW(sccDecomposition.size() > 0, storm::exceptions::IllegalArgumentException, "Can not solve given equation system as the SCC decomposition returned no SCCs."); - - storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = sccDecomposition.extractPartitionDependencyGraph(*this->A); - std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); - - // Calculate the optimal distribution of sccs - std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, *this->A); - STORM_LOG_INFO("Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); - - std::vector* currentX = nullptr; - std::vector* swap = nullptr; - size_t currentMaxLocalIterations = 0; - size_t localIterations = 0; - size_t globalIterations = 0; - bool converged = true; - - // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only - // solved after all SCCs it depends on have been solved. - for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { - bool const useGpu = sccIndexIt->first; - storm::storage::StateBlock const& scc = sccIndexIt->second; - - // Generate a sub matrix - storm::storage::BitVector subMatrixIndices(this->A->getColumnCount(), scc.cbegin(), scc.cend()); - storm::storage::SparseMatrix sccSubmatrix = this->A->getSubmatrix(true, subMatrixIndices, subMatrixIndices); - std::vector sccSubB(sccSubmatrix.getRowCount()); - storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); - std::vector sccSubX(sccSubmatrix.getColumnCount()); - std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); - std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); - - // Prepare the pointers for swapping in the calculation - currentX = &sccSubX; - swap = &sccSubXSwap; - - storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states) - std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); - sccSubNondeterministicChoiceIndices.at(0) = 0; - - // Pre-process all dependent states - // Remove outgoing transitions and create the ChoiceIndices - uint_fast64_t innerIndex = 0; - uint_fast64_t outerIndex = 0; - for (uint_fast64_t state : scc) { - // Choice Indices - sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); - - for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { - typename storm::storage::SparseMatrix::const_rows row = this->A->getRow(rowGroupIt); - for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { - if (!subMatrixIndices.get(rowIt->getColumn())) { - // This is an outgoing transition of a state in the SCC to a state not included in the SCC - // Subtracting Pr(tau) * x_other from b fixes that - sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); - } - } - ++innerIndex; - } - ++outerIndex; - } - - // For the current SCC, we need to perform value iteration until convergence. - if (useGpu) { -#ifdef STORM_HAVE_CUDA - STORM_LOG_THROW(resetCudaDevice(), storm::exceptions::InvalidStateException, "Could not reset CUDA Device, can not use CUDA-based equation solver."); - - //STORM_LOG_INFO("Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); - //STORM_LOG_INFO("We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); - //STORM_LOG_INFO("The CUDA Runtime Version is " << getRuntimeCudaVersion()); - - bool result = false; - localIterations = 0; - if (dir == OptimizationDirection::Minimum) { - result = __basicValueIteration_mvReduce_minimize(maxIters, precision, relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } else { - result = __basicValueIteration_mvReduce_maximize(maxIters, precision, relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } - STORM_LOG_INFO("Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); - - if (!result) { - converged = false; - STORM_LOG_ERROR("An error occurred in the CUDA Plugin. Can not continue."); - throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; - } else { - converged = true; - } - - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; - } - globalIterations += localIterations; -#else - STORM_LOG_ERROR("The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!"); - throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of storm does not support CUDA acceleration. Internal Error!"; -#endif - } else { - //std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl; - STORM_LOG_INFO("Performance Warning: Using CPU based TopoSolver! (double)"); - localIterations = 0; - converged = false; - while (!converged && localIterations < maxIters) { - // Compute x' = A*x + b. - sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); - storm::utility::vector::addVectors(sccMultiplyResult, sccSubB, sccMultiplyResult); - - //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); - //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); - - /* - Versus: - A.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, b); - */ - - // Reduce the vector x' by applying min/max for all non-deterministic choices. - storm::utility::vector::reduceVectorMinOrMax(dir,sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); - - // Determine whether the method converged. - // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher - // running time. In fact, it is faster. This has to be investigated. - // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, precision, relative); - - // Update environment variables. - std::swap(currentX, swap); - - ++localIterations; - ++globalIterations; - } - STORM_LOG_INFO("Executed " << localIterations << " of max. " << maxIters << " Iterations."); - } - - - // The Result of this SCC has to be taken back into the main result vector - innerIndex = 0; - for (uint_fast64_t state : scc) { - x.at(state) = currentX->at(innerIndex); - ++innerIndex; - } - - // Since the pointers for swapping in the calculation point to temps they should not be valid anymore - currentX = nullptr; - swap = nullptr; - - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; - } - } - - //std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << localIterations << " iterations in a single block." << std::endl; - - // Check if the solver converged and issue a warning otherwise. - if (converged) { - STORM_LOG_INFO("Iterative solver converged after " << currentMaxLocalIterations << " iterations."); - } else { - STORM_LOG_WARN("Iterative solver did not converged after " << currentMaxLocalIterations << " iterations."); - } - - return converged; - } - } - - template - std::vector> - TopologicalMinMaxLinearEquationSolver::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const { - - (void)matrix; - - std::vector> result; - -#ifdef STORM_HAVE_CUDA - // 95% to have a bit of padding - size_t const cudaFreeMemory = static_cast(getFreeCudaMemory() * 0.95); - size_t lastResultIndex = 0; - - std::vector const& rowGroupIndices = matrix.getRowGroupIndices(); - - size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(matrix.getRowCount()), rowGroupIndices.size(), static_cast(matrix.getEntryCount())); - size_t const gpuSizePerRowGroup = std::max(static_cast(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast(1)); - size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup; - - size_t currentSize = 0; - size_t neededReserveSize = 0; - size_t startIndex = 0; - for (size_t i = 0; i < topologicalSort.size(); ++i) { - storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]]; - size_t const currentSccSize = scc.size(); - - uint_fast64_t rowCount = 0; - uint_fast64_t entryCount = 0; - - for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { - rowCount += matrix.getRowGroupSize(*sccIt); - entryCount += matrix.getRowGroupEntryCount(*sccIt); - } - - size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(rowCount), scc.size(), static_cast(entryCount)); - - if ((currentSize + sccSize) <= cudaFreeMemory) { - // There is enough space left in the current group - neededReserveSize += currentSccSize; - currentSize += sccSize; - } else { - // This would make the last open group to big for the GPU - - if (startIndex < i) { - if ((startIndex + 1) < i) { - // More than one component - std::vector tempGroups; - tempGroups.reserve(neededReserveSize); - - // Copy the first group to make inplace_merge possible - storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; - tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); - - if (((startIndex + 1) + 80) >= i) { - size_t lastSize = 0; - for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { - storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; - lastSize = tempGroups.size(); - tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); - std::vector::iterator middleIterator = tempGroups.begin(); - std::advance(middleIterator, lastSize); - std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); - } - } else { - // Use std::sort - for (size_t j = startIndex + 1; j < i; ++j) { - storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; - tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); - } - std::sort(tempGroups.begin(), tempGroups.end()); - } - result.push_back(std::make_pair(true, storm::storage::StateBlock(tempGroups.cbegin(), tempGroups.cend()))); - } else { - // Only one group, copy construct. - result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); - } - ++lastResultIndex; - } - - if (sccSize <= cudaFreeMemory) { - currentSize = sccSize; - neededReserveSize = currentSccSize; - startIndex = i; - } else { - // This group is too big to fit into the CUDA Memory by itself - result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]])))); - ++lastResultIndex; - - currentSize = 0; - neededReserveSize = 0; - startIndex = i + 1; - } - } - } - - size_t const topologicalSortSize = topologicalSort.size(); - if (startIndex < topologicalSortSize) { - if ((startIndex + 1) < topologicalSortSize) { - // More than one component - std::vector tempGroups; - tempGroups.reserve(neededReserveSize); - - // Copy the first group to make inplace_merge possible. - storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; - tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); - - // For set counts <= 80, in-place merge is faster. - if (((startIndex + 1) + 80) >= topologicalSortSize) { - size_t lastSize = 0; - for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { - storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; - lastSize = tempGroups.size(); - tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); - std::vector::iterator middleIterator = tempGroups.begin(); - std::advance(middleIterator, lastSize); - std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); - } - } else { - // Use std::sort - for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { - storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; - tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); - } - std::sort(tempGroups.begin(), tempGroups.end()); - } - result.push_back(std::make_pair(true, storm::storage::StateBlock(tempGroups.cbegin(), tempGroups.cend()))); - } - else { - // Only one group, copy construct. - result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); - } - ++lastResultIndex; - } -#else - for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { - storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; - result.push_back(std::make_pair(false, scc)); - } -#endif - return result; - } - - template - void TopologicalMinMaxLinearEquationSolver::repeatedMultiply(Environment const& env, OptimizationDirection dir, std::vector& x, std::vector const* b, uint_fast64_t n) const { - std::unique_ptr> multiplyResult = std::make_unique>(this->A->getRowCount()); - - // Now perform matrix-vector multiplication as long as we meet the bound of the formula. - for (uint_fast64_t i = 0; i < n; ++i) { - this->A->multiplyWithVector(x, *multiplyResult); - - // Add b if it is non-null. - if (b != nullptr) { - storm::utility::vector::addVectors(*multiplyResult, *b, *multiplyResult); - } - - // Reduce the vector x' by applying min/max for all non-deterministic choices as given by the topmost - // element of the min/max operator stack. - storm::utility::vector::reduceVectorMinOrMax(dir, *multiplyResult, x, this->A->getRowGroupIndices()); - } - } - - template - TopologicalMinMaxLinearEquationSolverFactory::TopologicalMinMaxLinearEquationSolverFactory(bool trackScheduler) { - // Intentionally left empty. - } - - template - std::unique_ptr> TopologicalMinMaxLinearEquationSolverFactory::create(Environment const& env) const { - STORM_LOG_THROW(env.solver().minMax().getMethod() == MinMaxMethod::Topological, storm::exceptions::InvalidEnvironmentException, "This min max solver does not support the selected technique."); - return std::make_unique>(); - } - - // Explicitly instantiate the solver. - template class TopologicalMinMaxLinearEquationSolver; - - template class TopologicalMinMaxLinearEquationSolverFactory; - } // namespace solver -} // namespace storm diff --git a/src/storm/solver/TopologicalMinMaxLinearEquationSolver.h b/src/storm/solver/TopologicalMinMaxLinearEquationSolver.h index 7bf35c2d2..e69de29bb 100644 --- a/src/storm/solver/TopologicalMinMaxLinearEquationSolver.h +++ b/src/storm/solver/TopologicalMinMaxLinearEquationSolver.h @@ -1,154 +0,0 @@ -#ifndef STORM_SOLVER_TOPOLOGICALVALUEITERATIONMINMAXLINEAREQUATIONSOLVER_H_ -#define STORM_SOLVER_TOPOLOGICALVALUEITERATIONMINMAXLINEAREQUATIONSOLVER_H_ - -#include "storm/solver/MinMaxLinearEquationSolver.h" -#include "storm/storage/StronglyConnectedComponentDecomposition.h" -#include "storm/storage/SparseMatrix.h" -#include "storm/exceptions/NotImplementedException.h" -#include "storm/exceptions/NotSupportedException.h" - -#include -#include - -#include "storm-config.h" -#ifdef STORM_HAVE_CUDA -#include "cudaForStorm.h" -#endif - -namespace storm { - namespace solver { - - /*! - * A class that uses SCC Decompositions to solve a min/max linear equation system. - */ - template - class TopologicalMinMaxLinearEquationSolver : public MinMaxLinearEquationSolver { - public: - TopologicalMinMaxLinearEquationSolver(); - - /*! - * Constructs a min-max linear equation solver with parameters being set according to the settings - * object. - * - * @param A The matrix defining the coefficients of the linear equation system. - */ - TopologicalMinMaxLinearEquationSolver(storm::storage::SparseMatrix const& A); - - virtual void setMatrix(storm::storage::SparseMatrix const& matrix) override; - virtual void setMatrix(storm::storage::SparseMatrix&& matrix) override; - - virtual bool internalSolveEquations(Environment const& env, OptimizationDirection dir, std::vector& x, std::vector const& b) const override; - - virtual void repeatedMultiply(Environment const& env, OptimizationDirection dir, std::vector& x, std::vector const* b, uint_fast64_t n) const override; - - private: - storm::storage::SparseMatrix const* A; - std::unique_ptr> localA; - - bool enableCuda; - /*! - * Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory. - */ - std::vector> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const; - }; - - template - bool __basicValueIteration_mvReduce_minimize(uint_fast64_t const, double const, bool const, std::vector const&, std::vector> const&, std::vector& x, std::vector const&, std::vector const&, size_t&) { - // - STORM_LOG_THROW(false, storm::exceptions::NotImplementedException, "Unsupported template arguments."); - } - template <> - inline bool __basicValueIteration_mvReduce_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { - - (void)maxIterationCount; - (void)precision; - (void)relativePrecisionCheck; - (void)matrixRowIndices; - (void)columnIndicesAndValues; - (void)x; - (void)b; - (void)nondeterministicChoiceIndices; - (void)iterationCount; - -#ifdef STORM_HAVE_CUDA - return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); -#else - STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support."); -#endif - } - template <> - inline bool __basicValueIteration_mvReduce_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { - - (void)maxIterationCount; - (void)precision; - (void)relativePrecisionCheck; - (void)matrixRowIndices; - (void)columnIndicesAndValues; - (void)x; - (void)b; - (void)nondeterministicChoiceIndices; - (void)iterationCount; - -#ifdef STORM_HAVE_CUDA - return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); -#else - STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support."); -#endif - } - - template - bool __basicValueIteration_mvReduce_maximize(uint_fast64_t const, double const, bool const, std::vector const&, std::vector> const&, std::vector&, std::vector const&, std::vector const&, size_t&) { - STORM_LOG_THROW(false, storm::exceptions::NotImplementedException, "Unsupported template arguments."); - } - template <> - inline bool __basicValueIteration_mvReduce_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { - - (void)maxIterationCount; - (void)precision; - (void)relativePrecisionCheck; - (void)matrixRowIndices; - (void)columnIndicesAndValues; - (void)x; - (void)b; - (void)nondeterministicChoiceIndices; - (void)iterationCount; - -#ifdef STORM_HAVE_CUDA - return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); -#else - STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support."); -#endif - } - template <> - inline bool __basicValueIteration_mvReduce_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { - - (void)maxIterationCount; - (void)precision; - (void)relativePrecisionCheck; - (void)matrixRowIndices; - (void)columnIndicesAndValues; - (void)x; - (void)b; - (void)nondeterministicChoiceIndices; - (void)iterationCount; - -#ifdef STORM_HAVE_CUDA - return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); -#else - STORM_LOG_THROW(false, storm::exceptions::NotSupportedException, "Storm is compiled without CUDA support."); -#endif - } - - template - class TopologicalMinMaxLinearEquationSolverFactory : public MinMaxLinearEquationSolverFactory { - public: - TopologicalMinMaxLinearEquationSolverFactory(bool trackScheduler = false); - - protected: - virtual std::unique_ptr> create(Environment const& env) const override; - }; - - } // namespace solver -} // namespace storm - -#endif /* STORM_SOLVER_TOPOLOGICALVALUEITERATIONMINMAXLINEAREQUATIONSOLVER_H_ */ diff --git a/src/test/storm/solver/MinMaxLinearEquationSolverTest.cpp b/src/test/storm/solver/MinMaxLinearEquationSolverTest.cpp index 5da2067e5..3caf14699 100644 --- a/src/test/storm/solver/MinMaxLinearEquationSolverTest.cpp +++ b/src/test/storm/solver/MinMaxLinearEquationSolverTest.cpp @@ -45,6 +45,17 @@ namespace { return env; } }; + class DoubleTopologicalCudaViEnvironment { + public: + typedef double ValueType; + static const bool isExact = false; + static storm::Environment createEnvironment() { + storm::Environment env; + env.solver().minMax().setMethod(storm::solver::MinMaxMethod::TopologicalCuda); + env.solver().minMax().setPrecision(storm::utility::convertNumber(1e-8)); + return env; + } + }; class DoublePIEnvironment { public: typedef double ValueType; @@ -96,6 +107,7 @@ namespace { DoubleViEnvironment, DoubleSoundViEnvironment, DoubleTopologicalViEnvironment, + DoubleTopologicalCudaViEnvironment, DoublePIEnvironment, RationalPIEnvironment, RationalRationalSearchEnvironment