#include "src/solver/TopologicalMinMaxLinearEquationSolver.h"

// NOTE(review): the names of two standard headers were lost in a bad
// merge/paste of this file. <chrono>, <utility>, <algorithm> and
// <type_traits> are all used below, so they are (re-)included here —
// confirm against the original revision.
#include <algorithm>
#include <chrono>
#include <type_traits>
#include <utility>

#include "src/settings/SettingsManager.h"
#include "src/utility/vector.h"
#include "src/utility/graph.h"
#include "src/storage/StronglyConnectedComponentDecomposition.h"
#include "src/exceptions/IllegalArgumentException.h"
#include "src/exceptions/InvalidStateException.h"

#include "log4cplus/logger.h"
#include "log4cplus/loggingmacros.h"
extern log4cplus::Logger logger;

#include "storm-config.h"
#ifdef STORM_HAVE_CUDA
#include "cudaForStorm.h"
#endif

namespace storm {
    namespace solver {

        /*!
         * Constructs the solver for the given matrix, pulling precision,
         * iteration bound and convergence criterion from the topological
         * value-iteration settings module.
         */
        template<typename ValueType>
        TopologicalMinMaxLinearEquationSolver<ValueType>::TopologicalMinMaxLinearEquationSolver(storm::storage::SparseMatrix<ValueType> const& A) : NativeMinMaxLinearEquationSolver<ValueType>(A) {
            // Get the settings object to customize solving.
            auto settings = storm::settings::topologicalValueIterationEquationSolverSettings();

            // Get appropriate settings.
            this->maximalNumberOfIterations = settings.getMaximalIterationCount();
            this->precision = settings.getPrecision();
            this->relative = (settings.getConvergenceCriterion() == storm::settings::modules::TopologicalValueIterationEquationSolverSettings::ConvergenceCriterion::Relative);

            auto generalSettings = storm::settings::generalSettings();
            this->enableCuda = generalSettings.isCudaSet();
#ifdef STORM_HAVE_CUDA
            STORM_LOG_INFO_COND(this->enableCuda, "Option CUDA was not set, but the topological value iteration solver will use it anyways.");
#endif
        }

        /*!
         * Constructs the solver with explicitly given precision, iteration
         * bound and convergence criterion, bypassing the settings module.
         */
        template<typename ValueType>
        TopologicalMinMaxLinearEquationSolver<ValueType>::TopologicalMinMaxLinearEquationSolver(storm::storage::SparseMatrix<ValueType> const& A, double precision, uint_fast64_t maximalNumberOfIterations, bool relative) :
            NativeMinMaxLinearEquationSolver<ValueType>(A, precision, maximalNumberOfIterations, relative) {
            // Intentionally left empty.
        }

        /*!
         * Solves the min/max linear equation system by decomposing the system
         * into SCCs, topologically sorting them and running value iteration on
         * each SCC in order (on the GPU where the CUDA plugin is available and
         * the data fits into device memory, on the CPU otherwise).
         *
         * @param minimize If true, the minimum over the nondeterministic
         *        choices is taken, otherwise the maximum.
         * @param x In: initial guess. Out: the fixpoint solution.
         * @param b The vector to add after matrix-vector multiplication.
         * @param multiplyResult Unused by this implementation (kept for
         *        interface compatibility with the base class).
         * @param newX Unused by this implementation.
         */
        template<typename ValueType>
        void TopologicalMinMaxLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
#ifdef GPU_USE_FLOAT
#define __FORCE_FLOAT_CALCULATION true
#else
#define __FORCE_FLOAT_CALCULATION false
#endif
            if (__FORCE_FLOAT_CALCULATION && std::is_same<ValueType, double>::value) {
                // Forced single-precision mode: convert the system to float,
                // solve, and copy the result back.
                // FIXME: This actually allocates quite some storage, because of this conversion, is it really necessary?
                storm::storage::SparseMatrix<float> newA = this->A.template toValueType<float>();
                TopologicalMinMaxLinearEquationSolver<float> newSolver(newA, this->precision, this->maximalNumberOfIterations, this->relative);

                std::vector<float> new_x = storm::utility::vector::toValueType<float>(x);
                std::vector<float> const new_b = storm::utility::vector::toValueType<float>(b);

                newSolver.solveEquationSystem(minimize, new_x, new_b, nullptr, nullptr);

                for (size_t i = 0, size = new_x.size(); i < size; ++i) {
                    x.at(i) = new_x.at(i);
                }
                return;
            }

            // For testing only
            if (sizeof(ValueType) == sizeof(double)) {
                LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>");
            } else {
                LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>");
            }

            // Now, we need to determine the SCCs of the MDP and perform a topological sort.
            std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = this->A.getRowGroupIndices();

            // Check whether the decomposition is necessary at all: if the
            // complete system fits into GPU memory, skip the SCC machinery.
#ifdef STORM_HAVE_CUDA
#define __USE_CUDAFORSTORM_OPT true
            size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(this->A.getRowCount()), nondeterministicChoiceIndices.size(), static_cast<size_t>(this->A.getEntryCount()));
            // 95% to have a bit of padding.
            size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
#else
#define __USE_CUDAFORSTORM_OPT false
            size_t const gpuSizeOfCompleteSystem = 0;
            size_t const cudaFreeMemory = 0;
#endif
            if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) {
                // The whole system fits onto the GPU: solve it in one shot.
#ifdef STORM_HAVE_CUDA
                STORM_LOG_THROW(resetCudaDevice(), storm::exceptions::InvalidStateException, "Could not reset CUDA Device, can not use CUDA Equation Solver.");

                bool result = false;
                size_t globalIterations = 0;
                if (minimize) {
                    result = __basicValueIteration_mvReduce_minimize(this->maximalNumberOfIterations, this->precision, this->relative, this->A.rowIndications, this->A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations);
                } else {
                    result = __basicValueIteration_mvReduce_maximize(this->maximalNumberOfIterations, this->precision, this->relative, this->A.rowIndications, this->A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations);
                }
                LOG4CPLUS_INFO(logger, "Executed " << globalIterations << " of max. " << this->maximalNumberOfIterations << " Iterations on GPU.");

                bool converged = false;
                if (!result) {
                    converged = false;
                    LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
                    throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
                } else {
                    converged = true;
                }

                // Check if the solver converged and issue a warning otherwise.
                if (converged) {
                    LOG4CPLUS_INFO(logger, "Iterative solver converged after " << globalIterations << " iterations.");
                } else {
                    LOG4CPLUS_WARN(logger, "Iterative solver did not converge after " << globalIterations << " iterations.");
                }
#else
                LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
                throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
#endif
            } else {
                // Decompose into SCCs and process them in topological order.
                storm::storage::BitVector fullSystem(this->A.getRowGroupCount(), true);
                storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(this->A, fullSystem, false, false);

                STORM_LOG_THROW(sccDecomposition.size() > 0, storm::exceptions::IllegalArgumentException, "Can not solve given equation system as the SCC decomposition returned no SCCs.");

                storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = sccDecomposition.extractPartitionDependencyGraph(this->A);
                std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);

                // Calculate the optimal grouping of SCCs (GPU-sized batches vs.
                // CPU-only components).
                std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, this->A);
                LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs.");

                std::vector<ValueType>* currentX = nullptr;
                std::vector<ValueType>* swap = nullptr;
                size_t currentMaxLocalIterations = 0;
                size_t localIterations = 0;
                size_t globalIterations = 0;
                bool converged = true;

                // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
                // solved after all SCCs it depends on have been solved.
                for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
                    bool const useGpu = sccIndexIt->first;
                    storm::storage::StateBlock const& scc = sccIndexIt->second;

                    // Generate a sub matrix restricted to the states of this SCC.
                    storm::storage::BitVector subMatrixIndices(this->A.getColumnCount(), scc.cbegin(), scc.cend());
                    storm::storage::SparseMatrix<ValueType> sccSubmatrix = this->A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
                    std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount());
                    storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
                    std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount());
                    std::vector<ValueType> sccSubXSwap(sccSubmatrix.getColumnCount());
                    std::vector<ValueType> sccMultiplyResult(sccSubmatrix.getRowCount());

                    // Prepare the pointers for swapping in the calculation.
                    currentX = &sccSubX;
                    swap = &sccSubXSwap;

                    // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states).
                    storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x);
                    std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1);
                    sccSubNondeterministicChoiceIndices.at(0) = 0;

                    // Pre-process all dependent states:
                    // remove outgoing transitions and create the choice indices.
                    uint_fast64_t innerIndex = 0;
                    uint_fast64_t outerIndex = 0;
                    for (uint_fast64_t state : scc) {
                        // Choice Indices
                        sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]);

                        for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) {
                            typename storm::storage::SparseMatrix<ValueType>::const_rows row = this->A.getRow(rowGroupIt);
                            for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) {
                                if (!subMatrixIndices.get(rowIt->getColumn())) {
                                    // This is an outgoing transition of a state in the SCC to a state not included in the SCC.
                                    // Subtracting Pr(tau) * x_other from b fixes that.
                                    sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn()));
                                }
                            }
                            ++innerIndex;
                        }
                        ++outerIndex;
                    }

                    // For the current SCC, we need to perform value iteration until convergence.
                    if (useGpu) {
#ifdef STORM_HAVE_CUDA
                        STORM_LOG_THROW(resetCudaDevice(), storm::exceptions::InvalidStateException, "Could not reset CUDA Device, can not use CUDA-based equation solver.");

                        bool result = false;
                        localIterations = 0;
                        if (minimize) {
                            result = __basicValueIteration_mvReduce_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
                        } else {
                            result = __basicValueIteration_mvReduce_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
                        }
                        LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << this->maximalNumberOfIterations << " Iterations on GPU.");

                        if (!result) {
                            converged = false;
                            LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
                            throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
                        } else {
                            converged = true;
                        }

                        // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
                        // track of the maximum.
                        if (localIterations > currentMaxLocalIterations) {
                            currentMaxLocalIterations = localIterations;
                        }
                        globalIterations += localIterations;
#else
                        LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
                        throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
#endif
                    } else {
                        LOG4CPLUS_INFO(logger, "Performance Warning: Using CPU based TopoSolver! (double)");

                        localIterations = 0;
                        converged = false;
                        while (!converged && localIterations < this->maximalNumberOfIterations) {
                            // Compute x' = A*x + b.
                            sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
                            storm::utility::vector::addVectors(sccMultiplyResult, sccSubB, sccMultiplyResult);

                            // Reduce the vector x' by applying min/max for all non-deterministic choices.
                            if (minimize) {
                                storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
                            } else {
                                storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
                            }

                            // Determine whether the method converged.
                            // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
                            // running time. In fact, it is faster. This has to be investigated.
                            converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, static_cast<ValueType>(this->precision), this->relative);

                            // Update environment variables.
                            std::swap(currentX, swap);
                            ++localIterations;
                            ++globalIterations;
                        }
                        LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << this->maximalNumberOfIterations << " Iterations.");
                    }

                    // The result of this SCC has to be copied back into the main result vector.
                    innerIndex = 0;
                    for (uint_fast64_t state : scc) {
                        x.at(state) = currentX->at(innerIndex);
                        ++innerIndex;
                    }

                    // Since the pointers for swapping in the calculation point to temps they should not be valid anymore.
                    currentX = nullptr;
                    swap = nullptr;

                    // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
                    // track of the maximum.
                    if (localIterations > currentMaxLocalIterations) {
                        currentMaxLocalIterations = localIterations;
                    }
                }

                // Check if the solver converged and issue a warning otherwise.
                if (converged) {
                    LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
                } else {
                    LOG4CPLUS_WARN(logger, "Iterative solver did not converge after " << currentMaxLocalIterations << " iterations.");
                }
            }
        }

        /*!
         * Groups the topologically sorted SCCs into batches. With CUDA
         * support, consecutive SCCs are merged (as sorted state blocks) as
         * long as the batch fits into 95% of the free device memory; a pair's
         * first component tells whether the group should be solved on the GPU.
         * Without CUDA, every SCC becomes its own CPU-only group.
         *
         * @param sccDecomposition The SCC decomposition of the system.
         * @param topologicalSort Indices into the decomposition in topological order.
         * @param matrix The full system matrix (used for size estimation).
         * @return Pairs of (useGpu, states of the group) in topological order.
         */
        template<typename ValueType>
        std::vector<std::pair<bool, storm::storage::StateBlock>> TopologicalMinMaxLinearEquationSolver<ValueType>::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const {
            std::vector<std::pair<bool, storm::storage::StateBlock>> result;
#ifdef STORM_HAVE_CUDA
            // 95% to have a bit of padding.
            size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
            std::vector<uint_fast64_t> const& rowGroupIndices = matrix.getRowGroupIndices();

            size_t currentSize = 0;
            size_t neededReserveSize = 0;
            size_t startIndex = 0;
            for (size_t i = 0; i < topologicalSort.size(); ++i) {
                storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]];
                size_t const currentSccSize = scc.size();

                uint_fast64_t rowCount = 0;
                uint_fast64_t entryCount = 0;
                for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
                    rowCount += matrix.getRowGroupSize(*sccIt);
                    entryCount += matrix.getRowGroupEntryCount(*sccIt);
                }

                size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(rowCount), scc.size(), static_cast<size_t>(entryCount));

                if ((currentSize + sccSize) <= cudaFreeMemory) {
                    // There is enough space left in the current group.
                    neededReserveSize += currentSccSize;
                    currentSize += sccSize;
                } else {
                    // This SCC would make the last open group too big for the GPU,
                    // so flush the group [startIndex, i) first.
                    if (startIndex < i) {
                        if ((startIndex + 1) < i) {
                            // More than one component.
                            std::vector<uint_fast64_t> tempGroups;
                            tempGroups.reserve(neededReserveSize);

                            // Copy the first group to make inplace_merge possible.
                            storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]];
                            tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend());

                            // For set counts <= 80, in-place merge is faster.
                            if (((startIndex + 1) + 80) >= i) {
                                size_t lastSize = 0;
                                // Fixed: this loop must only merge the SCCs of the group
                                // being flushed (j < i), not all remaining SCCs.
                                for (size_t j = startIndex + 1; j < i; ++j) {
                                    storm::storage::StateBlock const& sccToMerge = sccDecomposition[topologicalSort[j]];
                                    lastSize = tempGroups.size();
                                    tempGroups.insert(tempGroups.cend(), sccToMerge.cbegin(), sccToMerge.cend());
                                    std::vector<uint_fast64_t>::iterator middleIterator = tempGroups.begin();
                                    std::advance(middleIterator, lastSize);
                                    std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end());
                                }
                            } else {
                                // Use std::sort.
                                for (size_t j = startIndex + 1; j < i; ++j) {
                                    storm::storage::StateBlock const& sccToMerge = sccDecomposition[topologicalSort[j]];
                                    tempGroups.insert(tempGroups.cend(), sccToMerge.cbegin(), sccToMerge.cend());
                                }
                                std::sort(tempGroups.begin(), tempGroups.end());
                            }
                            result.push_back(std::make_pair(true, storm::storage::StateBlock(tempGroups.cbegin(), tempGroups.cend())));
                        } else {
                            // Only one group, copy construct.
                            // NOTE(review): std::move on an element of a const
                            // decomposition degrades to a copy; kept for now.
                            result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
                        }
                    }
                    if (sccSize <= cudaFreeMemory) {
                        // Start a new group with the current SCC.
                        currentSize = sccSize;
                        neededReserveSize = currentSccSize;
                        startIndex = i;
                    } else {
                        // This SCC is too big to fit into the CUDA memory by itself:
                        // emit it as a CPU-only group.
                        result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]]))));
                        currentSize = 0;
                        neededReserveSize = 0;
                        startIndex = i + 1;
                    }
                }
            }

            // Flush the last open group, if any.
            size_t const topologicalSortSize = topologicalSort.size();
            if (startIndex < topologicalSortSize) {
                if ((startIndex + 1) < topologicalSortSize) {
                    // More than one component.
                    std::vector<uint_fast64_t> tempGroups;
                    tempGroups.reserve(neededReserveSize);

                    // Copy the first group to make inplace_merge possible.
                    storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]];
                    tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend());

                    // For set counts <= 80, in-place merge is faster.
                    if (((startIndex + 1) + 80) >= topologicalSortSize) {
                        size_t lastSize = 0;
                        for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) {
                            storm::storage::StateBlock const& sccToMerge = sccDecomposition[topologicalSort[j]];
                            lastSize = tempGroups.size();
                            tempGroups.insert(tempGroups.cend(), sccToMerge.cbegin(), sccToMerge.cend());
                            std::vector<uint_fast64_t>::iterator middleIterator = tempGroups.begin();
                            std::advance(middleIterator, lastSize);
                            std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end());
                        }
                    } else {
                        // Use std::sort.
                        for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) {
                            storm::storage::StateBlock const& sccToMerge = sccDecomposition[topologicalSort[j]];
                            tempGroups.insert(tempGroups.cend(), sccToMerge.cbegin(), sccToMerge.cend());
                        }
                        std::sort(tempGroups.begin(), tempGroups.end());
                    }
                    result.push_back(std::make_pair(true, storm::storage::StateBlock(tempGroups.cbegin(), tempGroups.cend())));
                } else {
                    // Only one group, copy construct.
                    result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
                }
            }
#else
            // Without CUDA support, every SCC is its own CPU-only group.
            for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
                storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
                result.push_back(std::make_pair(false, scc));
            }
#endif
            return result;
        }

        // Explicitly instantiate the solver.
        template class TopologicalMinMaxLinearEquationSolver<double>;
        template class TopologicalMinMaxLinearEquationSolver<float>;
    } // namespace solver
} // namespace storm