diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h index eee59b007..65eb47bdd 100644 --- a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h +++ b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h @@ -65,7 +65,7 @@ namespace device template __launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) __global__ void -storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double * x, double * y) +storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y) { __shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; @@ -135,7 +135,7 @@ storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const template __launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) __global__ void -storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y, const double minMaxInitializer) +storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer) { __shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index c3e543433..65451d743 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -1,6 +1,7 @@ #include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h" #include +#include #include "src/settings/Settings.h" #include "src/utility/vector.h" @@ -42,386 +43,283 @@ namespace storm { NondeterministicLinearEquationSolver* TopologicalValueIterationNondeterministicLinearEquationSolver::clone() const { return new TopologicalValueIterationNondeterministicLinearEquationSolver(*this); } + + template + void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { + +#ifdef GPU_USE_FLOAT +#define __FORCE_FLOAT_CALCULATION true +#else +#define __FORCE_FLOAT_CALCULATION false +#endif + if (__FORCE_FLOAT_CALCULATION && (sizeof(ValueType) == sizeof(double))) { + TopologicalValueIterationNondeterministicLinearEquationSolver tvindles(precision, maximalNumberOfIterations, relative); - template<> - void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { - // For testing only - std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; - LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>"); + storm::storage::SparseMatrix new_A = A.toValueType(); + std::vector new_x = storm::utility::vector::toValueType(x); + std::vector const new_b = storm::utility::vector::toValueType(b); - // Now, we need to determine the SCCs of the MDP and perform a topological sort. - std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); - storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); - storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); - - if (sccDecomposition.size() == 0) { - LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs."); - throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs."; - } + tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr); - storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); - std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); - - // Calculate the optimal distribution of sccs - std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); - LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); - - std::vector* currentX = nullptr; - std::vector* swap = nullptr; - size_t currentMaxLocalIterations = 0; - size_t localIterations = 0; - size_t globalIterations = 0; - bool converged = true; - - // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only - // solved after all SCCs it depends on have been solved. - int counter = 0; - - for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { - bool const useGpu = sccIndexIt->first; - storm::storage::StateBlock const& scc = sccIndexIt->second; - - // Generate a sub matrix - storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); - storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); - std::vector sccSubB(sccSubmatrix.getRowCount()); - storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); - std::vector sccSubX(sccSubmatrix.getColumnCount()); - std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); - std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); - - // Prepare the pointers for swapping in the calculation - currentX = &sccSubX; - swap = &sccSubXSwap; - - storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states) - std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); - sccSubNondeterministicChoiceIndices.at(0) = 0; - - // Pre-process all dependent states - // Remove outgoing transitions and create the ChoiceIndices - uint_fast64_t innerIndex = 0; - uint_fast64_t outerIndex = 0; - for (uint_fast64_t state : scc) { - // Choice Indices - sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); - - for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { - storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); - for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { - if (!subMatrixIndices.get(rowIt->getColumn())) { - // This is an outgoing transition of a state in the SCC to a state not included in the SCC - // Subtracting Pr(tau) * x_other from b fixes that - sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); - } - } - ++innerIndex; - } - ++outerIndex; + for (size_t i = 0, size = new_x.size(); i < size; ++i) { + x.at(i) = new_x.at(i); } + return; + } + + // For testing only + if (sizeof(ValueType) == sizeof(double)) { + std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; + LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>"); + } else { + std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; + LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>"); + } - // For the current SCC, we need to perform value iteration until convergence. - if (useGpu) { + // Now, we need to determine the SCCs of the MDP and perform a topological sort. + std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); + storm::models::NonDeterministicMatrixBasedPseudoModel const pseudoModel(A, nondeterministicChoiceIndices); + + // Check if the decomposition is necessary #ifdef STORM_HAVE_CUDAFORSTORM - if (!resetCudaDevice()) { - LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); - throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; - } - - //LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); - //LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); - //LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); - - bool result = false; - localIterations = 0; - if (minimize) { - result = basicValueIteration_mvReduce_uint64_float_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } else { - result = basicValueIteration_mvReduce_uint64_float_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); - - if (!result) { - converged = false; - LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); - throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; - } else { - converged = true; - } - - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; - } - +#define __USE_CUDAFORSTORM_OPT true + size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(A.getRowCount()), nondeterministicChoiceIndices.size(), static_cast(A.getEntryCount())); + size_t const cudaFreeMemory = static_cast(getFreeCudaMemory() * 0.95); #else - LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); - throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; +#define __USE_CUDAFORSTORM_OPT false + size_t const gpuSizeOfCompleteSystem = 0; + size_t const cudaFreeMemory = 0; #endif - } else { - localIterations = 0; - converged = false; - while (!converged && localIterations < this->maximalNumberOfIterations) { - // Compute x' = A*x + b. - sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); - storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); - - //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); - //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + std::vector> sccDecomposition; + if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) { + // Dummy output for SCC Times + std::cout << "Computing the SCC Decomposition took 0ms" << std::endl; - /* - Versus: - A.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, b); - */ - - // Reduce the vector x' by applying min/max for all non-deterministic choices. - if (minimize) { - storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); - } else { - storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); - } - - // Determine whether the method converged. - // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher - // running time. In fact, it is faster. This has to be investigated. - // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); - - // Update environment variables. - std::swap(currentX, swap); - - ++localIterations; - ++globalIterations; - } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); +#ifdef STORM_HAVE_CUDAFORSTORM + if (!resetCudaDevice()) { + LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); + throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; } - - // The Result of this SCC has to be taken back into the main result vector - innerIndex = 0; - for (uint_fast64_t state : scc) { - x.at(state) = currentX->at(innerIndex); - ++innerIndex; + std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now(); + bool result = false; + size_t globalIterations = 0; + if (minimize) { + result = __basicValueIteration_mvReduce_uint64_minimize(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); + } else { + result = __basicValueIteration_mvReduce_uint64_maximize(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); } + LOG4CPLUS_INFO(logger, "Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); - // Since the pointers for swapping in the calculation point to temps they should not be valid anymore - currentX = nullptr; - swap = nullptr; - - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; + bool converged = false; + if (!result) { + converged = false; + LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; + } else { + converged = true; } - } - - // Check if the solver converged and issue a warning otherwise. - if (converged) { - LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); - } else { - LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations."); - } - } - - template - void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { - -#ifndef GPU_USE_DOUBLE - TopologicalValueIterationNondeterministicLinearEquationSolver tvindles(precision, maximalNumberOfIterations, relative); - storm::storage::SparseMatrix new_A = A.toValueType(); - std::vector new_x = storm::utility::vector::toValueType(x); - std::vector const new_b = storm::utility::vector::toValueType(b); + std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now(); + std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast(calcEndTime - calcStartTime).count() << "ms." << std::endl; - tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr); + std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << globalIterations << " iterations in a single block." << std::endl; - for (size_t i = 0, size = new_x.size(); i < size; ++i) { - x.at(i) = new_x.at(i); - } + // Check if the solver converged and issue a warning otherwise. + if (converged) { + LOG4CPLUS_INFO(logger, "Iterative solver converged after " << globalIterations << " iterations."); + } else { + LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << globalIterations << " iterations."); + } #else - // For testing only - std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; - LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>"); - - // Now, we need to determine the SCCs of the MDP and perform a topological sort. - std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); - storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); - storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); + LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; +#endif + } else { + std::chrono::high_resolution_clock::time_point sccStartTime = std::chrono::high_resolution_clock::now(); + storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); - if (sccDecomposition.size() == 0) { - LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs."); - throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs."; - } + if (sccDecomposition.size() == 0) { + LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs."); + throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs."; + } - storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); - std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); - - // Calculate the optimal distribution of sccs - std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); - LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); - - std::vector* currentX = nullptr; - std::vector* swap = nullptr; - size_t currentMaxLocalIterations = 0; - size_t localIterations = 0; - size_t globalIterations = 0; - bool converged = true; - - // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only - // solved after all SCCs it depends on have been solved. - int counter = 0; - - for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { - bool const useGpu = sccIndexIt->first; - storm::storage::StateBlock const& scc = sccIndexIt->second; - - // Generate a sub matrix - storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); - storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); - std::vector sccSubB(sccSubmatrix.getRowCount()); - storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); - std::vector sccSubX(sccSubmatrix.getColumnCount()); - std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); - std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); - - // Prepare the pointers for swapping in the calculation - currentX = &sccSubX; - swap = &sccSubXSwap; - - storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states) - std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); - sccSubNondeterministicChoiceIndices.at(0) = 0; - - // Pre-process all dependent states - // Remove outgoing transitions and create the ChoiceIndices - uint_fast64_t innerIndex = 0; - uint_fast64_t outerIndex = 0; - for (uint_fast64_t state: scc) { - // Choice Indices - sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); - - for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { - typename storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); - for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { - if (!subMatrixIndices.get(rowIt->getColumn())) { - // This is an outgoing transition of a state in the SCC to a state not included in the SCC - // Subtracting Pr(tau) * x_other from b fixes that - sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); + storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); + std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); + + // Calculate the optimal distribution of sccs + std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); + LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); + + std::chrono::high_resolution_clock::time_point sccEndTime = std::chrono::high_resolution_clock::now(); + std::cout << "Computing the SCC Decomposition took " << std::chrono::duration_cast(sccEndTime - sccStartTime).count() << "ms." << std::endl; + + std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now(); + + std::vector* currentX = nullptr; + std::vector* swap = nullptr; + size_t currentMaxLocalIterations = 0; + size_t localIterations = 0; + size_t globalIterations = 0; + bool converged = true; + + // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only + // solved after all SCCs it depends on have been solved. + int counter = 0; + + for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { + bool const useGpu = sccIndexIt->first; + storm::storage::StateBlock const& scc = sccIndexIt->second; + + // Generate a sub matrix + storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); + storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); + std::vector sccSubB(sccSubmatrix.getRowCount()); + storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); + std::vector sccSubX(sccSubmatrix.getColumnCount()); + std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); + std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); + + // Prepare the pointers for swapping in the calculation + currentX = &sccSubX; + swap = &sccSubXSwap; + + storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states) + std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); + sccSubNondeterministicChoiceIndices.at(0) = 0; + + // Pre-process all dependent states + // Remove outgoing transitions and create the ChoiceIndices + uint_fast64_t innerIndex = 0; + uint_fast64_t outerIndex = 0; + for (uint_fast64_t state : scc) { + // Choice Indices + sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); + + for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { + typename storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); + for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { + if (!subMatrixIndices.get(rowIt->getColumn())) { + // This is an outgoing transition of a state in the SCC to a state not included in the SCC + // Subtracting Pr(tau) * x_other from b fixes that + sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); + } } + ++innerIndex; } - ++innerIndex; + ++outerIndex; } - ++outerIndex; - } - // For the current SCC, we need to perform value iteration until convergence. - if (useGpu) { + // For the current SCC, we need to perform value iteration until convergence. + if (useGpu) { #ifdef STORM_HAVE_CUDAFORSTORM - if (!resetCudaDevice()) { - LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); - throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; - } + if (!resetCudaDevice()) { + LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); + throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; + } - //LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); - //LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); - //LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); + //LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); + //LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); + //LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); - bool result = false; - localIterations = 0; - if (minimize) { - result = basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } else { - result = basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); - - if (!result) { - converged = false; - LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); - throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; - } else { - converged = true; - } + bool result = false; + localIterations = 0; + if (minimize) { + result = __basicValueIteration_mvReduce_uint64_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } else { + result = __basicValueIteration_mvReduce_uint64_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; - } + if (!result) { + converged = false; + LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; + } else { + converged = true; + } + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + globalIterations += localIterations; #else - LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); - throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; + LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; #endif - } else { - localIterations = 0; - converged = false; - while (!converged && localIterations < this->maximalNumberOfIterations) { - // Compute x' = A*x + b. - sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); - storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); + } else { + std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl; + localIterations = 0; + converged = false; + while (!converged && localIterations < this->maximalNumberOfIterations) { + // Compute x' = A*x + b. + sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); + storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); + + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); + //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + + /* + Versus: + A.multiplyWithVector(*currentX, *multiplyResult); + storm::utility::vector::addVectorsInPlace(*multiplyResult, b); + */ + + // Reduce the vector x' by applying min/max for all non-deterministic choices. + if (minimize) { + storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } else { + storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } - //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); - //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + // Determine whether the method converged. + // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher + // running time. In fact, it is faster. This has to be investigated. + // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); - /* - Versus: - A.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, b); - */ + // Update environment variables. + std::swap(currentX, swap); - // Reduce the vector x' by applying min/max for all non-deterministic choices. - if (minimize) { - storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + ++localIterations; + ++globalIterations; } - else { - storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); - } - - // Determine whether the method converged. - // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher - // running time. In fact, it is faster. This has to be investigated. - // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); + } - // Update environment variables. - std::swap(currentX, swap); - ++localIterations; - ++globalIterations; + // The Result of this SCC has to be taken back into the main result vector + innerIndex = 0; + for (uint_fast64_t state : scc) { + x.at(state) = currentX->at(innerIndex); + ++innerIndex; } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); - } + // Since the pointers for swapping in the calculation point to temps they should not be valid anymore + currentX = nullptr; + swap = nullptr; - // The Result of this SCC has to be taken back into the main result vector - innerIndex = 0; - for (uint_fast64_t state: scc) { - x.at(state) = currentX->at(innerIndex); - ++innerIndex; + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } } - // Since the pointers for swapping in the calculation point to temps they should not be valid anymore - currentX = nullptr; - swap = nullptr; + std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << localIterations << " iterations in a single block." << std::endl; - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; + // Check if the solver converged and issue a warning otherwise. + if (converged) { + LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); + } else { + LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations."); } - } - // Check if the solver converged and issue a warning otherwise. - if (converged) { - LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); - } - else { - LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations."); + std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now(); + std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast(calcEndTime - calcStartTime).count() << "ms." << std::endl; } -#endif } template diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h index 40d9df354..bdbf788cd 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h @@ -8,6 +8,11 @@ #include #include +#include "storm-config.h" +#ifdef STORM_HAVE_CUDAFORSTORM +# include "cudaForStorm.h" +#endif + namespace storm { namespace solver { @@ -42,6 +47,50 @@ namespace storm { */ std::vector> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const; }; + + template + bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + // + throw; + } + template <> + inline bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { +#ifdef STORM_HAVE_CUDAFORSTORM + return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + throw; +#endif + } + template <> + inline bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { +#ifdef STORM_HAVE_CUDAFORSTORM + return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + throw; +#endif + } + + template + bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + // + throw; + } + template <> + inline bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { +#ifdef STORM_HAVE_CUDAFORSTORM + return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + throw; +#endif + } + template <> + inline bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { +#ifdef STORM_HAVE_CUDAFORSTORM + return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + throw; +#endif + } } // namespace solver } // namespace storm