diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
index d3c412ddf..ff93d29b6 100644
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
@@ -35,7 +35,7 @@
 	__host__ __device__ T operator()(const T &x, const T &y) const {
 		if (Relative) {
 			if (y == 0) {
-				return x;
+				return ((x >= 0) ? (x) : (-x));
 			}
 			const T result = (x - y) / y;
 			return ((result >= 0) ? (result) : (-result));
@@ -73,6 +73,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
 #ifdef DEBUG
 	std::cout.sync_with_stdio(true);
+	std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
 	std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
 	size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
 	std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
@@ -143,6 +144,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
 		goto cleanup;
 	}
+#ifdef DEBUG
+	std::cout << "(DLL) Finished allocating memory." << std::endl;
+#endif
+
 	// Memory allocated, copy data to device
 	cudaError_t cudaCopyResult;
@@ -204,6 +209,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
 		goto cleanup;
 	}
+#ifdef DEBUG
+	std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
+#endif
+
 	// Data is on device, start Kernel
 	while (!converged && iterationCount < maxIterationCount) {
 		// In a sub-area since transfer of control via label evades initialization
@@ -239,6 +248,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
 		std::swap(device_x, device_xSwap);
 	}
 #ifdef DEBUG
+	std::cout << "(DLL) Finished kernel execution." << std::endl;
 	std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
 #endif
@@ -249,6 +259,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
 		goto cleanup;
 	}
+#ifdef DEBUG
+	std::cout << "(DLL) Finished copying result data." << std::endl;
+#endif
+
 	// All code related to freeing memory and clearing up the device
 cleanup:
 	if (device_matrixRowIndices != nullptr) {
@@ -307,6 +321,9 @@ cleanup:
 		}
 		device_nondeterministicChoiceIndices = nullptr;
 	}
+#ifdef DEBUG
+	std::cout << "(DLL) Finished cleanup." << std::endl;
+#endif
 }
 
 template <typename IndexType, typename ValueType>
@@ -323,6 +340,7 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector
 	std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
 	size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size();
 	std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
@@ -368,6 +386,10 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector
 (matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
 	CUDA_CHECK_ALL_ERRORS();
+#ifdef DEBUG
+	std::cout << "(DLL) Finished kernel execution." << std::endl;
+#endif
+
 	// Get result back from the device
 	cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost);
 	if (cudaCopyResult != cudaSuccess) {
@@ -417,6 +447,10 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector
@@ -730,4 +767,27 @@ void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const max
 	} else {
 		basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
 	}
+}
+
+size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
+	size_t const valueTypeSize = sizeof(double);
+	size_t const indexTypeSize = sizeof(uint_fast64_t);
+
+	/*
+	IndexType* device_matrixRowIndices = nullptr;
+	IndexType* device_matrixColIndices = nullptr;
+	ValueType* device_matrixValues = nullptr;
+	ValueType* device_x = nullptr;
+	ValueType* device_xSwap = nullptr;
+	ValueType* device_b = nullptr;
+	ValueType* device_multiplyResult = nullptr;
+	IndexType* device_nondeterministicChoiceIndices = nullptr;
+	*/
+
+	// Row Indices, Column Indices, Values, Choice Indices
+	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
+	// Vectors x, xSwap, b, multiplyResult
+	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
+
+	return (matrixDataSize + vectorSizes);
+}
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h
index ad4ce0fc9..f23cbec28 100644
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.h
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h
@@ -8,6 +8,7 @@
 // Library exports
 #include "cudaForStorm_Export.h"
+cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
 cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
 cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
 cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b);
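
For reference, the byte counting behind the newly exported basicValueIteration_mvReduce_uint64_double_calculateMemorySize can be exercised on its own. The sketch below is not part of the patch; the function name estimateDeviceFootprint and the sample numbers in main are made up for illustration, only the formula is taken from the hunk above.

#include <cstddef>
#include <cstdint>
#include <iostream>

// Same arithmetic as basicValueIteration_mvReduce_uint64_double_calculateMemorySize:
// matrix data (row indications, column indices, values, row group indices) plus the
// vectors x, xSwap, b and multiplyResult that the kernel allocates on the device.
std::size_t estimateDeviceFootprint(std::size_t rowCount, std::size_t rowGroupCount, std::size_t nnzCount) {
	std::size_t const valueTypeSize = sizeof(double);
	std::size_t const indexTypeSize = sizeof(uint_fast64_t);
	std::size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
	std::size_t const vectorSizes = (2 * rowGroupCount * valueTypeSize) + (2 * rowCount * valueTypeSize);
	return matrixDataSize + vectorSizes;
}

int main() {
	// Example figures only; the solver derives them from the SCC submatrix at hand.
	std::cout << estimateDeviceFootprint(1000, 400, 5000) << " Bytes" << std::endl;
	return 0;
}
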
diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
index f50e8caf0..6807a77fd 100644
--- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
@@ -42,12 +42,12 @@ namespace storm {
 		}
 
 		template<typename ValueType>
-		void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
+		void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
 			// Now, we need to determine the SCCs of the MDP and a topological sort.
 			//std::vector> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph);
 			//storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents);
-
+			std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
 			storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> pseudoModel(A, nondeterministicChoiceIndices);
 			//storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(*static_cast*>(&pseudoModel), false, false);
 			storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(pseudoModel, false, false);
@@ -60,6 +60,9 @@ namespace storm {
 			storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
 			std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
+			// Calculate the optimal distribution of sccs
+			std::vector<std::pair<bool, std::vector<uint_fast64_t>>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A);
+
 			// Set up the environment for the power method.
 //			bool multiplyResultMemoryProvided = true;
 //			if (multiplyResult == nullptr) {
@@ -82,12 +85,13 @@ namespace storm {
 			// solved after all SCCs it depends on have been solved.
 			int counter = 0;
-			for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) {
-				storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
+			for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
+				bool const useGpu = sccIndexIt->first;
+				std::vector<uint_fast64_t> const& scc = sccIndexIt->second;
 				// Generate a submatrix
 				storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
-				storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(subMatrixIndices, nondeterministicChoiceIndices);
+				storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
 				std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount());
 				storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
 				std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount());
@@ -125,108 +129,115 @@ namespace storm {
 				}
 				// For the current SCC, we need to perform value iteration until convergence.
+				if (useGpu) {
 #ifdef STORM_HAVE_CUDAFORSTORM
-				if (!resetCudaDevice()) {
-					LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
-					throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
-				}
-
-				LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
-				LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
-				LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
-
-				std::vector<ValueType> copyX(*currentX);
-				if (minimize) {
-					basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
-				}
-				else {
-					basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
-				}
-
-				localIterations = 0;
-				converged = false;
-				while (!converged && localIterations < this->maximalNumberOfIterations) {
-					// Compute x' = A*x + b.
-					sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
-					storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB);
-
-					//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
-					//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
+					if (!resetCudaDevice()) {
+						LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
+						throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
+					}
-					/*
-					Versus:
-					A.multiplyWithVector(*currentX, *multiplyResult);
-					storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
-					*/
+					LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
+					LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
+					LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
-					// Reduce the vector x' by applying min/max for all non-deterministic choices.
+					std::vector<ValueType> copyX(*currentX);
 					if (minimize) {
-						storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+						basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
 					}
 					else {
-						storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+						basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
 					}
+					converged = true;
+
+					// DEBUG
+					localIterations = 0;
+					converged = false;
+					while (!converged && localIterations < this->maximalNumberOfIterations) {
+						// Compute x' = A*x + b.
+						sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
+						storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB);
+
+						//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
+						//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
+
+						/*
+						Versus:
+						A.multiplyWithVector(*currentX, *multiplyResult);
+						storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
+						*/
+
+						// Reduce the vector x' by applying min/max for all non-deterministic choices.
+						if (minimize) {
+							storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+						}
+						else {
+							storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+						}
-						// Determine whether the method converged.
-						// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
-						// running time. In fact, it is faster. This has to be investigated.
-						// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
-						converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative);
-
-						// Update environment variables.
-						std::swap(currentX, swap);
+						// Determine whether the method converged.
+						// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
+						// running time. In fact, it is faster. This has to be investigated.
+						// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
+						converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative);
-						++localIterations;
-						++globalIterations;
-					}
-					LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
+						// Update environment variables.
+						std::swap(currentX, swap);
-					uint_fast64_t diffCount = 0;
-					for (size_t i = 0; i < currentX->size(); ++i) {
-						if (currentX->at(i) != copyX.at(i)) {
-							LOG4CPLUS_WARN(logger, "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i));
-							std::cout << "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i) << std::endl;
-						}
-					}
-#else
-					localIterations = 0;
-					converged = false;
-					while (!converged && localIterations < this->maximalNumberOfIterations) {
-						// Compute x' = A*x + b.
-						sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
-						storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB);
-
-						//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
-						//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
-
-						/*
-						Versus:
-						A.multiplyWithVector(*currentX, *multiplyResult);
-						storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
-						*/
-
-						// Reduce the vector x' by applying min/max for all non-deterministic choices.
-						if (minimize) {
-							storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+						++localIterations;
+						++globalIterations;
 						}
-						else {
-							storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+					LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
+
+					uint_fast64_t diffCount = 0;
+					for (size_t i = 0; i < currentX->size(); ++i) {
+						if (currentX->at(i) != copyX.at(i)) {
+							LOG4CPLUS_WARN(logger, "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i));
+							std::cout << "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i) << std::endl;
+							++diffCount;
+						}
 					}
+					std::cout << "CUDA solution differed in " << diffCount << " of " << currentX->size() << " values." << std::endl;
+#endif
+				} else {
+					localIterations = 0;
+					converged = false;
+					while (!converged && localIterations < this->maximalNumberOfIterations) {
+						// Compute x' = A*x + b.
+						sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
+						storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB);
+
+						//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
+						//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
+
+						/*
+						Versus:
+						A.multiplyWithVector(*currentX, *multiplyResult);
+						storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
+						*/
+
+						// Reduce the vector x' by applying min/max for all non-deterministic choices.
+						if (minimize) {
+							storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+						}
+						else {
+							storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+						}
-					// Determine whether the method converged.
-					// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
-					// running time. In fact, it is faster. This has to be investigated.
-					// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
-					converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative);
+						// Determine whether the method converged.
+						// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
+						// running time. In fact, it is faster. This has to be investigated.
+						// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
+						converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative);
-					// Update environment variables.
-					std::swap(currentX, swap);
+						// Update environment variables.
+						std::swap(currentX, swap);
-					++localIterations;
-					++globalIterations;
+						++localIterations;
+						++globalIterations;
+					}
+					LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
 				}
-				LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
-#endif
+
 				// The Result of this SCC has to be taken back into the main result vector
 				innerIndex = 0;
@@ -263,6 +274,72 @@ namespace storm {
 			}
 		}
 
+		template <typename ValueType>
+		std::vector<std::pair<bool, std::vector<uint_fast64_t>>>
+		TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const {
+			std::vector<std::pair<bool, std::vector<uint_fast64_t>>> result;
+#ifdef STORM_HAVE_CUDAFORSTORM
+			// 95% to have a bit of padding
+			size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
+			size_t lastResultIndex = 0;
+
+			std::vector<uint_fast64_t> const& rowGroupIndices = matrix.getRowGroupIndices();
+			size_t currentSize = 0;
+			for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
+				storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
+
+				uint_fast64_t rowCount = 0;
+				uint_fast64_t entryCount = 0;
+				std::vector<uint_fast64_t> rowGroups;
+				rowGroups.reserve(scc.size());
+
+				for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
+					rowCount += matrix.getRowGroupSize(*sccIt);
+					entryCount += matrix.getRowGroupEntryCount(*sccIt);
+					rowGroups.push_back(*sccIt);
+				}
+
+				size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(rowCount), scc.size(), static_cast<size_t>(entryCount));
+
+				if ((currentSize + sccSize) <= cudaFreeMemory) {
+					// There is enough space left in the current group
+
+					if (currentSize == 0) {
+						result.push_back(std::make_pair(true, rowGroups));
+					}
+					else {
+						result[lastResultIndex].second.insert(result[lastResultIndex].second.end(), rowGroups.begin(), rowGroups.end());
+					}
+					currentSize += sccSize;
+				}
+				else {
+					if (sccSize <= cudaFreeMemory) {
+						++lastResultIndex;
+						result.push_back(std::make_pair(true, rowGroups));
+						currentSize = sccSize;
+					}
+					else {
+						// This group is too big to fit into the CUDA Memory by itself
+						lastResultIndex += 2;
+						result.push_back(std::make_pair(false, rowGroups));
+						currentSize = 0;
+					}
+				}
+			}
+#else
+			for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
+				storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
+				std::vector<uint_fast64_t> rowGroups;
+				rowGroups.reserve(scc.size());
+				for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
+					rowGroups.push_back(*sccIt);
+					result.push_back(std::make_pair(false, rowGroups));
+				}
+			}
+#endif
+			return result;
+		}
+
 		// Explicitly instantiate the solver.
 		template class TopologicalValueIterationNondeterministicLinearEquationSolver<double>;
 	} // namespace solver
diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
index 86262e062..84aba15fe 100644
--- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
@@ -2,6 +2,11 @@
 #define STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_
 
 #include "src/solver/NativeNondeterministicLinearEquationSolver.h"
+#include "src/storage/StronglyConnectedComponentDecomposition.h"
+#include "src/storage/SparseMatrix.h"
+
+#include <utility>
+#include <vector>
 
 namespace storm {
     namespace solver {
@@ -30,7 +35,12 @@ namespace storm {
 
            virtual NondeterministicLinearEquationSolver<ValueType>* clone() const override;
 
-           virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, std::vector<ValueType>* multiplyResult = nullptr, std::vector<ValueType>* newX = nullptr) const override;
+           virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult = nullptr, std::vector<ValueType>* newX = nullptr) const override;
+        private:
+           /*!
+            * Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory.
+            */
+           std::vector<std::pair<bool, std::vector<uint_fast64_t>>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const;
         };
     } // namespace solver
 } // namespace storm
diff --git a/src/storage/SparseMatrix.cpp b/src/storage/SparseMatrix.cpp
index 69100e6bb..3aee26996 100644
--- a/src/storage/SparseMatrix.cpp
+++ b/src/storage/SparseMatrix.cpp
@@ -330,6 +330,15 @@ namespace storm {
            return entryCount;
        }
 
+       template<typename T>
+       uint_fast64_t SparseMatrix<T>::getRowGroupEntryCount(uint_fast64_t const group) const {
+           uint_fast64_t result = 0;
+           for (uint_fast64_t row = this->getRowGroupIndices()[group]; row < this->getRowGroupIndices()[group + 1]; ++row) {
+               result += (this->rowIndications[row + 1] - this->rowIndications[row]);
+           }
+           return result;
+       }
+
        template<typename T>
        uint_fast64_t SparseMatrix<T>::getRowGroupCount() const {
            return rowGroupIndices.size() - 1;
diff --git a/src/storage/SparseMatrix.h b/src/storage/SparseMatrix.h
index f55b55b90..ae3a87abd 100644
--- a/src/storage/SparseMatrix.h
+++ b/src/storage/SparseMatrix.h
@@ -344,6 +344,13 @@ namespace storm {
         * @return The number of entries in the matrix.
         */
        uint_fast64_t getEntryCount() const;
+
+       /*!
+        * Returns the number of entries in the given row group of the matrix.
+        *
+        * @return The number of entries in the given row group of the matrix.
+        */
+       uint_fast64_t getRowGroupEntryCount(uint_fast64_t const group) const;
 
       /*!
       * Returns the number of row groups in the matrix.
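
The new getOptimalGroupingFromTopologicalSccDecomposition walks the SCCs in topological order and packs consecutive SCCs into batches whose estimated device footprint stays within 95% of the free CUDA memory; an SCC that is too large on its own is flagged for the CPU fallback instead. Below is a simplified, self-contained sketch of that greedy packing; the names packSccsForGpu and estimateSize are illustrative and not the solver's actual interface.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <utility>
#include <vector>

// (useGpu, row groups of the merged SCCs) -- mirrors the solver's std::pair<bool, std::vector<uint_fast64_t>>.
using Group = std::pair<bool, std::vector<uint_fast64_t>>;

std::vector<Group> packSccsForGpu(std::vector<std::vector<uint_fast64_t>> const& sccsInTopologicalOrder,
                                  std::function<std::size_t(std::vector<uint_fast64_t> const&)> const& estimateSize,
                                  std::size_t gpuBudget) {
	std::vector<Group> result;
	std::size_t currentSize = 0;
	for (auto const& scc : sccsInTopologicalOrder) {
		std::size_t const sccSize = estimateSize(scc);
		if (sccSize > gpuBudget) {
			// Too large for the device even on its own: solve on the CPU and close the current GPU batch.
			result.emplace_back(false, scc);
			currentSize = 0;
		} else if (!result.empty() && result.back().first && currentSize + sccSize <= gpuBudget) {
			// Fits into the currently open GPU batch, so merge the row groups.
			result.back().second.insert(result.back().second.end(), scc.begin(), scc.end());
			currentSize += sccSize;
		} else {
			// Start a new GPU batch with this SCC.
			result.emplace_back(true, scc);
			currentSize = sccSize;
		}
	}
	return result;
}

A caller in the spirit of the patch would pass getFreeCudaMemory() * 0.95 as gpuBudget and a lambda that combines getRowGroupSize, getRowGroupEntryCount and basicValueIteration_mvReduce_uint64_double_calculateMemorySize as estimateSize.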