From 64891af7854e0136c6d0ff72f31a9ac974889a39 Mon Sep 17 00:00:00 2001 From: PBerger Date: Mon, 3 Feb 2014 04:21:35 +0100 Subject: [PATCH 01/43] Trying to refurbish the TopologicalValueIterationMdpPrctlModelChecker Former-commit-id: 2963c774b04a9b8e3afa1f4b8f30d9563884c9a6 --- ...ogicalValueIterationMdpPrctlModelChecker.h | 2 +- ...tiveNondeterministicLinearEquationSolver.h | 2 +- ...tiveNondeterministicLinearEquationSolver.h | 39 +++ ...onNondeterministicLinearEquationSolver.cpp | 197 +++++++++++++++ ...ValueIterationMdpPrctlModelCheckerTest.cpp | 224 ++++++++++++++++++ 5 files changed, 462 insertions(+), 2 deletions(-) create mode 100644 src/solver/TopologicalValueIterationNativeNondeterministicLinearEquationSolver.h create mode 100644 src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp create mode 100644 test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp diff --git a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h index 77fa27543..84bc1764d 100644 --- a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h +++ b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h @@ -37,7 +37,7 @@ public: * Copy constructs a SparseMdpPrctlModelChecker from the given model checker. In particular, this means that the newly * constructed model checker will have the model of the given model checker as its associated model. */ - explicit TopologicalValueIterationMdpPrctlModelChecker(storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker const& modelchecker) + explicit TopologicalValueIterationMdpPrctlModelChecker(storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker const& modelchecker) : SparseMdpPrctlModelChecker(modelchecker), minimumOperatorStack() { // Intentionally left empty. } diff --git a/src/solver/NativeNondeterministicLinearEquationSolver.h b/src/solver/NativeNondeterministicLinearEquationSolver.h index 51c8ddd97..64b361acb 100644 --- a/src/solver/NativeNondeterministicLinearEquationSolver.h +++ b/src/solver/NativeNondeterministicLinearEquationSolver.h @@ -34,7 +34,7 @@ namespace storm { virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, std::vector* multiplyResult = nullptr, std::vector* newX = nullptr) const override; - private: + protected: // The required precision for the iterative methods. double precision; diff --git a/src/solver/TopologicalValueIterationNativeNondeterministicLinearEquationSolver.h b/src/solver/TopologicalValueIterationNativeNondeterministicLinearEquationSolver.h new file mode 100644 index 000000000..2dfc0206d --- /dev/null +++ b/src/solver/TopologicalValueIterationNativeNondeterministicLinearEquationSolver.h @@ -0,0 +1,39 @@ +#ifndef STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ +#define STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ + +#include "src/solver/NondeterministicLinearEquationSolver.h" +#include "src/solver/NativeNondeterministicLinearEquationSolver.h" + +namespace storm { + namespace solver { + + /*! + * A class that uses SCC Decompositions to solve a linear equation system + */ + template + class TopologicalValueIterationNondeterministicLinearEquationSolver : public NondeterministicLinearEquationSolver { + public: + /*! 
+         * Constructs a nondeterministic linear equation solver with parameters being set according to the settings
+         * object.
+         */
+        TopologicalValueIterationNondeterministicLinearEquationSolver();
+
+        /*!
+         * Constructs a nondeterministic linear equation solver with the given parameters.
+         *
+         * @param precision The precision to use for convergence detection.
+         * @param maximalNumberOfIterations The maximal number of iterations to perform before iteration is aborted.
+         * @param relative If set, the relative error rather than the absolute error is considered for convergence
+         * detection.
+         */
+        TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative = true);
+
+        virtual NondeterministicLinearEquationSolver<ValueType>* clone() const override;
+
+        virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, std::vector<ValueType>* multiplyResult = nullptr, std::vector<ValueType>* newX = nullptr) const override;
+        };
+    } // namespace solver
+} // namespace storm
+
+#endif /* STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ */
diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
new file mode 100644
index 000000000..4b90dbb40
--- /dev/null
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
@@ -0,0 +1,197 @@
+#include "src/solver/TopologicalValueIterationNativeNondeterministicLinearEquationSolver.h"
+
+#include <vector>
+
+#include "src/settings/Settings.h"
+#include "src/utility/vector.h"
+
+namespace storm {
+    namespace solver {
+
+        template<typename ValueType>
+        TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::TopologicalValueIterationNondeterministicLinearEquationSolver() {
+            // // Intentionally left empty.
+        }
+
+        template<typename ValueType>
+        TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative) : NativeNondeterministicLinearEquationSolver<ValueType>(precision, maximalNumberOfIterations, relative) {
+            // Intentionally left empty.
+        }
+
+        template<typename ValueType>
+        TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>* TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::clone() const {
+            return new NativeNondeterministicLinearEquationSolver<ValueType>(*this);
+        }
+
+        template<typename ValueType>
+        void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
+
+            // Now, we need to determine the SCCs of the MDP and a topological sort.
+            std::vector<std::vector<uint_fast64_t>> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph);
+            storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents);
+            std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
+
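A minimal sketch of the solving scheme that follows, assuming a hypothetical helper solveScc() that runs value iteration restricted to a single block; the helper and its signature are illustrative only, not part of the committed code:

    for (uint_fast64_t sccIndex : topologicalSort) {
        // Every block reachable from this SCC was solved in an earlier round, so its
        // entries in x are already final and act as constants for the current block.
        solveScc(stronglyConnectedComponents[sccIndex], A, x, b);
    }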
+            // Set up the environment for the power method.
+            bool multiplyResultMemoryProvided = true;
+            if (multiplyResult == nullptr) {
+                multiplyResult = new std::vector<ValueType>(A.getRowCount());
+                multiplyResultMemoryProvided = false;
+            }
+            std::vector<ValueType>* currentX = &x;
+            bool xMemoryProvided = true;
+            if (newX == nullptr) {
+                newX = new std::vector<ValueType>(x.size());
+                xMemoryProvided = false;
+            }
+            std::vector<ValueType>* swap = nullptr;
+            uint_fast64_t currentMaxLocalIterations = 0;
+            uint_fast64_t localIterations = 0;
+            uint_fast64_t globalIterations = 0;
+            bool converged = true;
+
+            // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
+            // solved after all SCCs it depends on have been solved.
+            for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) {
+                std::vector<uint_fast64_t> const& scc = stronglyConnectedComponents[*sccIndexIt];
+
+                // For the current SCC, we need to perform value iteration until convergence.
+                localIterations = 0;
+                converged = false;
+                while (!converged && localIterations < maxIterations) {
+                    // Compute x' = A*x + b.
+                    matrix.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
+                    storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
+
+                    /*
+                    Versus:
+                    A.multiplyWithVector(*currentX, *multiplyResult);
+                    storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
+                    */
+
+                    // Reduce the vector x' by applying min/max for all non-deterministic choices.
+                    if (minimize) {
+                        storm::utility::reduceVectorMin(*multiplyResult, *newX, scc, nondeterministicChoiceIndices);
+                    }
+                    else {
+                        storm::utility::reduceVectorMax(*multiplyResult, *newX, scc, nondeterministicChoiceIndices);
+                    }
+
+                    // Determine whether the method converged.
+                    // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
+                    // running time. In fact, it is faster. This has to be investigated.
+                    // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
+                    converged = storm::utility::equalModuloPrecision(*currentX, *newX, precision, relative);
+
+                    // Update environment variables.
+                    swap = currentX;
+                    currentX = newX;
+                    newX = swap;
+                    ++localIterations;
+                    ++globalIterations;
+                }
+
+                // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
+                // track of the maximum.
+                if (localIterations > currentMaxLocalIterations) {
+                    currentMaxLocalIterations = localIterations;
+                }
+            }
+
+            // If we performed an odd number of global iterations, we need to swap the x and currentX, because the newest
+            // result is currently stored in currentX, but x is the output vector.
+            // TODO: Check whether this is correct or should be put into the for-loop over SCCs.
+            if (globalIterations % 2 == 1) {
+                std::swap(x, *currentX);
+            }
+
+            if (!xMemoryProvided) {
+                delete copyX;
+            }
+
+            if (!multiplyResultMemoryProvided) {
+                delete multiplyResult;
+            }
+
+            // Check if the solver converged and issue a warning otherwise.
+            if (converged) {
+                LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
+            }
+            else {
+                LOG4CPLUS_WARN(logger, "Iterative solver did not converge after " << currentMaxLocalIterations << " iterations.");
+            }
+
+
+            /*
+
+            !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+            */
+
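Both halves of this function use the same allocate-if-not-provided handling of scratch memory; in isolation the pattern looks as follows (a sketch with illustrative names, assuming double-valued vectors):

    void useScratch(std::vector<double>* providedScratch, std::size_t requiredSize) {
        std::vector<double>* scratch = providedScratch;   // may be nullptr
        bool ownsScratch = false;
        if (scratch == nullptr) {
            // No buffer supplied by the caller, so allocate and remember to free it.
            scratch = new std::vector<double>(requiredSize);
            ownsScratch = true;
        }
        // ... use *scratch ...
        if (ownsScratch) {
            delete scratch;
        }
    }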
+            // Set up the environment for the power method. If scratch memory was not provided, we need to create it.
+            bool multiplyResultMemoryProvided = true;
+            if (multiplyResult == nullptr) {
+                multiplyResult = new std::vector<ValueType>(A.getRowCount());
+                multiplyResultMemoryProvided = false;
+            }
+            std::vector<ValueType>* currentX = &x;
+            bool xMemoryProvided = true;
+            if (newX == nullptr) {
+                newX = new std::vector<ValueType>(x.size());
+                xMemoryProvided = false;
+            }
+            std::vector<ValueType>* swap = nullptr;
+            uint_fast64_t iterations = 0;
+            bool converged = false;
+
+            // Keep track of which of the vectors for x is the auxiliary copy.
+            std::vector<ValueType>* copyX = newX;
+
+            // Proceed with the iterations as long as the method did not converge or reach the
+            // user-specified maximum number of iterations.
+            while (!converged && iterations < maximalNumberOfIterations) {
+                // Compute x' = A*x + b.
+                A.multiplyWithVector(*currentX, *multiplyResult);
+                storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
+
+                // Reduce the vector x' by applying min/max for all non-deterministic choices as given by the topmost
+                // element of the min/max operator stack.
+                if (minimize) {
+                    storm::utility::vector::reduceVectorMin(*multiplyResult, *newX, nondeterministicChoiceIndices);
+                } else {
+                    storm::utility::vector::reduceVectorMax(*multiplyResult, *newX, nondeterministicChoiceIndices);
+                }
+
+                // Determine whether the method converged.
+                converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative);
+
+                // Update environment variables.
+                std::swap(currentX, newX);
+                ++iterations;
+            }
+
+            // Check if the solver converged and issue a warning otherwise.
+            if (converged) {
+                LOG4CPLUS_INFO(logger, "Iterative solver converged after " << iterations << " iterations.");
+            } else {
+                LOG4CPLUS_WARN(logger, "Iterative solver did not converge after " << iterations << " iterations.");
+            }
+
+            // If we performed an odd number of iterations, we need to swap the x and currentX, because the newest result
+            // is currently stored in currentX, but x is the output vector.
+            if (currentX == copyX) {
+                std::swap(x, *currentX);
+            }
+
+            if (!xMemoryProvided) {
+                delete copyX;
+            }
+
+            if (!multiplyResultMemoryProvided) {
+                delete multiplyResult;
+            }
+        }
+
+        // Explicitly instantiate the solver.
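(The explicit instantiation that follows keeps the template's definitions confined to this translation unit; only the instantiations listed here are usable from other object files.)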
+ template class TopologicalValueIterationNondeterministicLinearEquationSolver; + } // namespace solver +} // namespace storm diff --git a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp new file mode 100644 index 000000000..7ccb05369 --- /dev/null +++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -0,0 +1,224 @@ +#include "gtest/gtest.h" +#include "storm-config.h" + +#include "src/solver/NativeNondeterministicLinearEquationSolver.h" +#include "src/settings/Settings.h" +#include "src/modelchecker/prctl/SparseMdpPrctlModelChecker.h" +#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h" +#include "src/parser/AutoParser.h" + +TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { + storm::settings::Settings* s = storm::settings::Settings::getInstance(); + storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); + + ASSERT_EQ(parser.getType(), storm::models::MDP); + + std::shared_ptr> mdp = parser.getModel>(); + + ASSERT_EQ(mdp->getNumberOfStates(), 169ull); + ASSERT_EQ(mdp->getNumberOfTransitions(), 436ull); + + storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); + + storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("two"); + storm::property::prctl::Eventually* eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + storm::property::prctl::ProbabilisticNoBoundOperator* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, true); + + std::vector result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 0.0277777612209320068), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + + delete probFormula; + + apFormula = new storm::property::prctl::Ap("two"); + eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, false); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 0.0277777612209320068), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + + delete probFormula; + + apFormula = new storm::property::prctl::Ap("three"); + eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, true); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 0.0555555224418640136), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + + delete probFormula; + + apFormula = new storm::property::prctl::Ap("three"); + eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, false); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 0.0555555224418640136), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + + delete probFormula; + + apFormula = new storm::property::prctl::Ap("four"); + eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + probFormula = new 
storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, true); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 0.083333283662796020508), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + + delete probFormula; + + apFormula = new storm::property::prctl::Ap("four"); + eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, false); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 0.083333283662796020508), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + + delete probFormula; + + apFormula = new storm::property::prctl::Ap("done"); + storm::property::prctl::ReachabilityReward* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + storm::property::prctl::RewardNoBoundOperator* rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, true); + + result = mc.checkNoBoundOperator(*rewardFormula); + + ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; + + apFormula = new storm::property::prctl::Ap("done"); + reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, false); + + result = mc.checkNoBoundOperator(*rewardFormula);; + + ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; + + storm::parser::AutoParser stateRewardParser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", ""); + + ASSERT_EQ(stateRewardParser.getType(), storm::models::MDP); + + std::shared_ptr> stateRewardMdp = stateRewardParser.getModel>(); + + storm::modelchecker::prctl::SparseMdpPrctlModelChecker stateRewardModelChecker(*stateRewardMdp, std::shared_ptr>(new storm::solver::NativeNondeterministicLinearEquationSolver())); + + apFormula = new storm::property::prctl::Ap("done"); + reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, true); + + result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula); + + ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; + + apFormula = new storm::property::prctl::Ap("done"); + reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, false); + + result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula); + + ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; + + storm::parser::AutoParser stateAndTransitionRewardParser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); + + 
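The assertion pattern repeated throughout these tests could be captured in a small helper; a sketch only (expectResultNear is not part of the committed test):

    void expectResultNear(std::vector<double> const& result, double expected) {
        // Compare against the user-configured precision, as each test does inline.
        storm::settings::Settings* s = storm::settings::Settings::getInstance();
        ASSERT_LT(std::abs(result[0] - expected), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
    }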
ASSERT_EQ(stateAndTransitionRewardParser.getType(), storm::models::MDP); + + std::shared_ptr> stateAndTransitionRewardMdp = stateAndTransitionRewardParser.getModel>(); + + storm::modelchecker::prctl::SparseMdpPrctlModelChecker stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp, std::shared_ptr>(new storm::solver::NativeNondeterministicLinearEquationSolver())); + + apFormula = new storm::property::prctl::Ap("done"); + reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, true); + + result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); + + ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; + + apFormula = new storm::property::prctl::Ap("done"); + reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, false); + + result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); + + ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; +} + +TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { + storm::settings::Settings* s = storm::settings::Settings::getInstance(); + storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.trans.rew"); + + ASSERT_EQ(parser.getType(), storm::models::MDP); + + std::shared_ptr> mdp = parser.getModel>(); + + ASSERT_EQ(mdp->getNumberOfStates(), 3172ull); + ASSERT_EQ(mdp->getNumberOfTransitions(), 7144ull); + + storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); + + storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("elected"); + storm::property::prctl::Eventually* eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + storm::property::prctl::ProbabilisticNoBoundOperator* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, true); + + std::vector result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 1), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + + delete probFormula; + + apFormula = new storm::property::prctl::Ap("elected"); + eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, false); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 1), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + + delete probFormula; + + apFormula = new storm::property::prctl::Ap("elected"); + storm::property::prctl::BoundedEventually* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually(apFormula, 25); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(boundedEventuallyFormula, false); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 0.0625), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + + delete probFormula; + + apFormula = new 
storm::property::prctl::Ap("elected"); + boundedEventuallyFormula = new storm::property::prctl::BoundedEventually(apFormula, 25); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(boundedEventuallyFormula, true); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 0.0625), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + + delete probFormula; + + apFormula = new storm::property::prctl::Ap("elected"); + storm::property::prctl::ReachabilityReward* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + storm::property::prctl::RewardNoBoundOperator* rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, true); + + result = mc.checkNoBoundOperator(*rewardFormula);; + + ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; + + apFormula = new storm::property::prctl::Ap("elected"); + reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, false); + + result = mc.checkNoBoundOperator(*rewardFormula);; + + ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; +} From 57b6208eee6fe4830167d2e2dace44d7db4548b1 Mon Sep 17 00:00:00 2001 From: PBerger Date: Mon, 10 Feb 2014 00:42:59 +0100 Subject: [PATCH 02/43] Added a pseudo model which can be constructed from only a matrix to look and behave like a model for use in Decomposition classes Former-commit-id: f8fdc5a9b636837f60db1c57dfec2b2b405f421b --- ...ogicalValueIterationMdpPrctlModelChecker.h | 2 +- src/models/PseudoModel.cpp | 108 ++++++++++++++++++ src/models/PseudoModel.h | 90 +++++++++++++++ .../NondeterministicLinearEquationSolver.h | 2 +- ...onNondeterministicLinearEquationSolver.cpp | 105 ++++------------- ...ionNondeterministicLinearEquationSolver.h} | 3 +- src/storage/Decomposition.h | 5 +- ...tronglyConnectedComponentDecomposition.cpp | 37 ++++-- .../StronglyConnectedComponentDecomposition.h | 55 +++++++-- 9 files changed, 304 insertions(+), 103 deletions(-) create mode 100644 src/models/PseudoModel.cpp create mode 100644 src/models/PseudoModel.h rename src/solver/{TopologicalValueIterationNativeNondeterministicLinearEquationSolver.h => TopologicalValueIterationNondeterministicLinearEquationSolver.h} (94%) diff --git a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h index 84bc1764d..750d068c8 100644 --- a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h +++ b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h @@ -68,7 +68,7 @@ private: bool relative = s->getOptionByLongName("relative").getArgument(0).getValueAsBoolean(); // Now, we need to determine the SCCs of the MDP and a topological sort. 
- std::vector> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph); + std::vector> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(matrix, nondeterministicChoiceIndices, stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph); storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents); std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); diff --git a/src/models/PseudoModel.cpp b/src/models/PseudoModel.cpp new file mode 100644 index 000000000..37749a873 --- /dev/null +++ b/src/models/PseudoModel.cpp @@ -0,0 +1,108 @@ +#include "src/models/PseudoModel.h" +#include "src/utility/constants.h" +#include "src/models/AbstractModel.h" + +namespace storm { + namespace models { + + template + ModelBasedPseudoModel::ModelBasedPseudoModel(storm::models::AbstractModel const& model) : _model(model) { + // Intentionally left empty. + } + + template + NonDeterministicMatrixBasedPseudoModel::NonDeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix const& matrix, std::vector const& nondeterministicChoiceIndices) : _matrix(matrix), _nondeterministicChoiceIndices(nondeterministicChoiceIndices) { + // Intentionally left empty. + } + + template + DeterministicMatrixBasedPseudoModel::DeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix const& matrix) : _matrix(matrix) { + // Intentionally left empty. + } + + template + typename storm::storage::SparseMatrix::const_rows + ModelBasedPseudoModel::getRows(uint_fast64_t state) const { + return this->_model.getRows(state); + } + + template + typename storm::storage::SparseMatrix::const_rows + NonDeterministicMatrixBasedPseudoModel::getRows(uint_fast64_t state) const { + return this->_matrix.getRows(this->_nondeterministicChoiceIndices[state], this->_nondeterministicChoiceIndices[state + 1] - 1); + } + + template + typename storm::storage::SparseMatrix::const_rows + DeterministicMatrixBasedPseudoModel::getRows(uint_fast64_t state) const { + return this->_matrix.getRows(state, state); + } + + template + uint_fast64_t + ModelBasedPseudoModel::getNumberOfStates() const { + return this->_model.getNumberOfStates(); + } + + template + uint_fast64_t + NonDeterministicMatrixBasedPseudoModel::getNumberOfStates() const { + return this->_matrix.getColumnCount(); + } + + template + uint_fast64_t + DeterministicMatrixBasedPseudoModel::getNumberOfStates() const { + return this->_matrix.getColumnCount(); + } + + template + storm::storage::SparseMatrix + AbstractPseudoModel::extractPartitionDependencyGraph(storm::storage::Decomposition const& decomposition) const { + uint_fast64_t numberOfStates = decomposition.size(); + + // First, we need to create a mapping of states to their SCC index, to ease the computation of dependency transitions later. + std::vector stateToBlockMap(this->getNumberOfStates()); + for (uint_fast64_t i = 0; i < decomposition.size(); ++i) { + for (auto state : decomposition[i]) { + stateToBlockMap[state] = i; + } + } + + // The resulting sparse matrix will have as many rows/columns as there are blocks in the partition. 
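A tiny worked example of the construction below: for a decomposition with blocks {0, 1} and {2} and a single cross-block transition from state 1 to state 2, stateToBlockMap is [0, 0, 1], so the only dependency recorded is block 0 -> block 1; transitions that stay inside a block are skipped.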
+ storm::storage::SparseMatrixBuilder dependencyGraphBuilder(numberOfStates, numberOfStates); + + for (uint_fast64_t currentBlockIndex = 0; currentBlockIndex < decomposition.size(); ++currentBlockIndex) { + // Get the next block. + typename storm::storage::StateBlock const& block = decomposition[currentBlockIndex]; + + // Now, we determine the blocks which are reachable (in one step) from the current block. + boost::container::flat_set allTargetBlocks; + for (auto state : block) { + for (auto const& transitionEntry : this->getRows(state)) { + uint_fast64_t targetBlock = stateToBlockMap[transitionEntry.first]; + + // We only need to consider transitions that are actually leaving the SCC. + if (targetBlock != currentBlockIndex) { + allTargetBlocks.insert(targetBlock); + } + } + } + + // Now we can just enumerate all the target SCCs and insert the corresponding transitions. + for (auto targetBlock : allTargetBlocks) { + dependencyGraphBuilder.addNextValue(currentBlockIndex, targetBlock, storm::utility::constantOne()); + } + } + + return dependencyGraphBuilder.build(); + } + + template class ModelBasedPseudoModel; + template class NonDeterministicMatrixBasedPseudoModel; + template class DeterministicMatrixBasedPseudoModel; + template class ModelBasedPseudoModel; + template class NonDeterministicMatrixBasedPseudoModel; + template class DeterministicMatrixBasedPseudoModel; + } // namespace models +} // namespace storm \ No newline at end of file diff --git a/src/models/PseudoModel.h b/src/models/PseudoModel.h new file mode 100644 index 000000000..6d3fd38da --- /dev/null +++ b/src/models/PseudoModel.h @@ -0,0 +1,90 @@ +#ifndef STORM_MODELS_PSEUDOMODEL_H_ +#define STORM_MODELS_PSEUDOMODEL_H_ + +#include +#include "src/storage/SparseMatrix.h" +#include "src/storage/Decomposition.h" + +namespace storm { + namespace models { + // Forward declare the abstract model class. + template class AbstractModel; + + /*! + * This classes encapsulate the model/transitionmatrix on which the SCC decomposition is performed. + * The Abstract Base class is specialized by the two possible representations: + * - For a model the implementation ModelBasedPseudoModel hands the call to getRows() through to the model + * - For a matrix of a nondeterministic model the implementation NonDeterministicMatrixBasedPseudoModel emulates the call + * on the matrix itself like the model function would + * - For a matrix of a deterministic model the implementation DeterministicMatrixBasedPseudoModel emulates the call + * on the matrix itself like the model function would + */ + template + class AbstractPseudoModel { + public: + AbstractPseudoModel() {} + virtual ~AbstractPseudoModel() {} + + virtual typename storm::storage::SparseMatrix::const_rows getRows(uint_fast64_t state) const = 0; + + /*! + * Calculates the number of states in the represented system. + * @return The Number of States in the underlying model/transition matrix + */ + virtual uint_fast64_t getNumberOfStates() const = 0; + + /*! + * Extracts the dependency graph from a (pseudo-) model according to the given partition. + * + * @param decomposition A decomposition containing the blocks of the partition of the system. + * @return A sparse matrix with bool entries that represents the dependency graph of the blocks of the partition. + */ + virtual storm::storage::SparseMatrix extractPartitionDependencyGraph(storm::storage::Decomposition const& decomposition) const; + }; + + template + class ModelBasedPseudoModel : public AbstractPseudoModel { + public: + /*! 
+ * Creates an encapsulation for the SCC decomposition based on a model + * @param model The Model on which the decomposition is to be performed + */ + ModelBasedPseudoModel(storm::models::AbstractModel const& model); + virtual typename storm::storage::SparseMatrix::const_rows getRows(uint_fast64_t state) const override; + virtual uint_fast64_t getNumberOfStates() const override; + private: + storm::models::AbstractModel const& _model; + }; + + template + class NonDeterministicMatrixBasedPseudoModel : public AbstractPseudoModel { + public: + /*! + * Creates an encapsulation for the SCC decomposition based on a matrix + * @param matrix The Matrix on which the decomposition is to be performed + */ + NonDeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix const& matrix, std::vector const& nondeterministicChoiceIndices); + virtual typename storm::storage::SparseMatrix::const_rows getRows(uint_fast64_t state) const override; + virtual uint_fast64_t getNumberOfStates() const override; + private: + storm::storage::SparseMatrix const& _matrix; + std::vector const& _nondeterministicChoiceIndices; + }; + + template + class DeterministicMatrixBasedPseudoModel : public AbstractPseudoModel { + public: + /*! + * Creates an encapsulation for the SCC decomposition based on a matrix + * @param matrix The Matrix on which the decomposition is to be performed + */ + DeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix const& matrix); + virtual typename storm::storage::SparseMatrix::const_rows getRows(uint_fast64_t state) const override; + virtual uint_fast64_t getNumberOfStates() const override; + private: + storm::storage::SparseMatrix const& _matrix; + }; + } +} + +#endif // STORM_MODELS_PSEUDOMODEL_H_ \ No newline at end of file diff --git a/src/solver/NondeterministicLinearEquationSolver.h b/src/solver/NondeterministicLinearEquationSolver.h index 8c5bc2334..a6beafcbe 100644 --- a/src/solver/NondeterministicLinearEquationSolver.h +++ b/src/solver/NondeterministicLinearEquationSolver.h @@ -40,7 +40,7 @@ namespace storm { * @return The solution vector x of the system of linear equations as the content of the parameter x. */ virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector const& rowGroupIndices, std::vector* multiplyResult = nullptr, std::vector* newX = nullptr) const = 0; - + /*! * Performs (repeated) matrix-vector multiplication with the given parameters, i.e. computes * x[i+1] = min/max(A*x[i] + b) until x[n], where x[0] = x. 
After each multiplication and addition, the diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 4b90dbb40..e075e2ee6 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -1,34 +1,49 @@ -#include "src/solver/TopologicalValueIterationNativeNondeterministicLinearEquationSolver.h" +#include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h" #include #include "src/settings/Settings.h" #include "src/utility/vector.h" +#include "src/utility/graph.h" +#include "src/models/PseudoModel.h" +#include "src/storage/StronglyConnectedComponentDecomposition.h" namespace storm { namespace solver { template TopologicalValueIterationNondeterministicLinearEquationSolver::TopologicalValueIterationNondeterministicLinearEquationSolver() { - // // Intentionally left empty. + // Get the settings object to customize solving. + storm::settings::Settings* settings = storm::settings::Settings::getInstance(); + + // Get appropriate settings. + maximalNumberOfIterations = settings->getOptionByLongName("maxiter").getArgument(0).getValueAsUnsignedInteger(); + precision = settings->getOptionByLongName("precision").getArgument(0).getValueAsDouble(); + relative = !settings->isSet("absolute"); } template - TopologicalValueIterationNondeterministicLinearEquationSolver::TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative) : NativeNondeterministicLinearEquationSolver(precision, maximalNumberOfIterations, relative) { + TopologicalValueIterationNondeterministicLinearEquationSolver::TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative) : NativeNondeterministicLinearEquationSolver(precision, maximalNumberOfIterations, relative) { // Intentionally left empty. } template - TopologicalValueIterationNondeterministicLinearEquationSolver* TopologicalValueIterationNondeterministicLinearEquationSolver::clone() const { - return new NativeNondeterministicLinearEquationSolver(*this); + NondeterministicLinearEquationSolver* TopologicalValueIterationNondeterministicLinearEquationSolver::clone() const { + return new TopologicalValueIterationNondeterministicLinearEquationSolver(*this); } template void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, std::vector* multiplyResult, std::vector* newX) const { // Now, we need to determine the SCCs of the MDP and a topological sort. 
-            std::vector<std::vector<uint_fast64_t>> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph);
-            storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents);
+            //std::vector<std::vector<uint_fast64_t>> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph);
+            //storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents);
+
+            storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> pseudoModel(A, nondeterministicChoiceIndices);
+            storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(*static_cast<storm::models::AbstractPseudoModel<ValueType>*>(&pseudoModel), false, false);
+            storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
+
             std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
 
             // Set up the environment for the power method.
@@ -52,14 +67,14 @@ namespace storm {
             // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
             // solved after all SCCs it depends on have been solved.
             for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) {
-                std::vector<uint_fast64_t> const& scc = stronglyConnectedComponents[*sccIndexIt];
+                std::vector<uint_fast64_t> const& scc = sccDecomposition[*sccIndexIt];
 
                 // For the current SCC, we need to perform value iteration until convergence.
                 localIterations = 0;
                 converged = false;
-                while (!converged && localIterations < maxIterations) {
+                while (!converged && localIterations < maximalNumberOfIterations) {
                     // Compute x' = A*x + b.
-                    matrix.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
+                    A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
                     storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
 
                     /*
@@ -119,76 +134,6 @@ namespace storm {
             else {
                 LOG4CPLUS_WARN(logger, "Iterative solver did not converge after " << currentMaxLocalIterations << " iterations.");
             }
-
-
-            /*
-
-            !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-            */
-
-            // Set up the environment for the power method. If scratch memory was not provided, we need to create it.
-            bool multiplyResultMemoryProvided = true;
-            if (multiplyResult == nullptr) {
-                multiplyResult = new std::vector<ValueType>(A.getRowCount());
-                multiplyResultMemoryProvided = false;
-            }
-            std::vector<ValueType>* currentX = &x;
-            bool xMemoryProvided = true;
-            if (newX == nullptr) {
-                newX = new std::vector<ValueType>(x.size());
-                xMemoryProvided = false;
-            }
-            std::vector<ValueType>* swap = nullptr;
-            uint_fast64_t iterations = 0;
-            bool converged = false;
-
-            // Keep track of which of the vectors for x is the auxiliary copy.
-            std::vector<ValueType>* copyX = newX;
-
-            // Proceed with the iterations as long as the method did not converge or reach the
-            // user-specified maximum number of iterations.
-            while (!converged && iterations < maximalNumberOfIterations) {
-                // Compute x' = A*x + b.
- A.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, b); - - // Reduce the vector x' by applying min/max for all non-deterministic choices as given by the topmost - // element of the min/max operator stack. - if (minimize) { - storm::utility::vector::reduceVectorMin(*multiplyResult, *newX, nondeterministicChoiceIndices); - } else { - storm::utility::vector::reduceVectorMax(*multiplyResult, *newX, nondeterministicChoiceIndices); - } - - // Determine whether the method converged. - converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative); - - // Update environment variables. - std::swap(currentX, newX); - ++iterations; - } - - // Check if the solver converged and issue a warning otherwise. - if (converged) { - LOG4CPLUS_INFO(logger, "Iterative solver converged after " << iterations << " iterations."); - } else { - LOG4CPLUS_WARN(logger, "Iterative solver did not converge after " << iterations << " iterations."); - } - - // If we performed an odd number of iterations, we need to swap the x and currentX, because the newest result - // is currently stored in currentX, but x is the output vector. - if (currentX == copyX) { - std::swap(x, *currentX); - } - - if (!xMemoryProvided) { - delete copyX; - } - - if (!multiplyResultMemoryProvided) { - delete multiplyResult; - } } // Explicitly instantiate the solver. diff --git a/src/solver/TopologicalValueIterationNativeNondeterministicLinearEquationSolver.h b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h similarity index 94% rename from src/solver/TopologicalValueIterationNativeNondeterministicLinearEquationSolver.h rename to src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h index 2dfc0206d..86262e062 100644 --- a/src/solver/TopologicalValueIterationNativeNondeterministicLinearEquationSolver.h +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h @@ -1,7 +1,6 @@ #ifndef STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ #define STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ -#include "src/solver/NondeterministicLinearEquationSolver.h" #include "src/solver/NativeNondeterministicLinearEquationSolver.h" namespace storm { @@ -11,7 +10,7 @@ namespace storm { * A class that uses SCC Decompositions to solve a linear equation system */ template - class TopologicalValueIterationNondeterministicLinearEquationSolver : public NondeterministicLinearEquationSolver { + class TopologicalValueIterationNondeterministicLinearEquationSolver : public NativeNondeterministicLinearEquationSolver { public: /*! * Constructs a nondeterministic linear equation solver with parameters being set according to the settings diff --git a/src/storage/Decomposition.h b/src/storage/Decomposition.h index 0c44943c9..5808e2500 100644 --- a/src/storage/Decomposition.h +++ b/src/storage/Decomposition.h @@ -8,7 +8,7 @@ namespace storm { namespace storage { // A typedef that specifies the type of a block consisting of states only. typedef boost::container::flat_set StateBlock; - + /*! * Writes a string representation of the state block to the given output stream. * @@ -129,11 +129,10 @@ namespace storm { * @return The block with the given index. */ Block& operator[](uint_fast64_t index); - + // Declare the streaming operator as a friend function to enable output of decompositions. 
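For illustration, the friend declared below allows a decomposition to be streamed directly; hypothetical usage:

    storm::storage::StronglyConnectedComponentDecomposition<double> sccs(model);
    std::cout << "Found " << sccs.size() << " SCCs: " << sccs << std::endl;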
template friend std::ostream& operator<<(std::ostream& out, Decomposition const& decomposition); - protected: // The blocks of the decomposition. std::vector blocks; diff --git a/src/storage/StronglyConnectedComponentDecomposition.cpp b/src/storage/StronglyConnectedComponentDecomposition.cpp index 6e9f08386..b46a5706f 100644 --- a/src/storage/StronglyConnectedComponentDecomposition.cpp +++ b/src/storage/StronglyConnectedComponentDecomposition.cpp @@ -1,5 +1,6 @@ #include "src/storage/StronglyConnectedComponentDecomposition.h" #include "src/models/AbstractModel.h" +#include "src/models/PseudoModel.h" namespace storm { namespace storage { @@ -10,19 +11,37 @@ namespace storm { template StronglyConnectedComponentDecomposition::StronglyConnectedComponentDecomposition(storm::models::AbstractModel const& model, bool dropNaiveSccs, bool onlyBottomSccs) : Decomposition() { - performSccDecomposition(model, dropNaiveSccs, onlyBottomSccs); + performSccDecomposition(storm::models::ModelBasedPseudoModel(model), dropNaiveSccs, onlyBottomSccs); } + + template + StronglyConnectedComponentDecomposition::StronglyConnectedComponentDecomposition(storm::models::AbstractPseudoModel const& pseudoModel, bool dropNaiveSccs, bool onlyBottomSccs) : Decomposition() { + performSccDecomposition(pseudoModel, dropNaiveSccs, onlyBottomSccs); + } - template + template StronglyConnectedComponentDecomposition::StronglyConnectedComponentDecomposition(storm::models::AbstractModel const& model, StateBlock const& block, bool dropNaiveSccs, bool onlyBottomSccs) { - storm::storage::BitVector subsystem(model.getNumberOfStates(), block.begin(), block.end()); - performSccDecomposition(model, subsystem, dropNaiveSccs, onlyBottomSccs); + storm::models::ModelBasedPseudoModel encapsulatedModel(model); + storm::storage::BitVector subsystem(encapsulatedModel.getNumberOfStates(), block.begin(), block.end()); + performSccDecomposition(*static_cast*>(&encapsulatedModel), subsystem, dropNaiveSccs, onlyBottomSccs); } + + template + StronglyConnectedComponentDecomposition::StronglyConnectedComponentDecomposition(storm::models::AbstractPseudoModel const& pseudoModel, StateBlock const& block, bool dropNaiveSccs, bool onlyBottomSccs) { + storm::storage::BitVector subsystem(pseudoModel.getNumberOfStates(), block.begin(), block.end()); + performSccDecomposition(pseudoModel, subsystem, dropNaiveSccs, onlyBottomSccs); + } template StronglyConnectedComponentDecomposition::StronglyConnectedComponentDecomposition(storm::models::AbstractModel const& model, storm::storage::BitVector const& subsystem, bool dropNaiveSccs, bool onlyBottomSccs) { - performSccDecomposition(model, subsystem, dropNaiveSccs, onlyBottomSccs); + storm::models::ModelBasedPseudoModel encapsulatedModel(model); + performSccDecomposition(*static_cast*>(&encapsulatedModel), subsystem, dropNaiveSccs, onlyBottomSccs); } + + template + StronglyConnectedComponentDecomposition::StronglyConnectedComponentDecomposition(storm::models::AbstractPseudoModel const& pseudoModel, storm::storage::BitVector const& subsystem, bool dropNaiveSccs, bool onlyBottomSccs) { + performSccDecomposition(pseudoModel, subsystem, dropNaiveSccs, onlyBottomSccs); + } template StronglyConnectedComponentDecomposition::StronglyConnectedComponentDecomposition(StronglyConnectedComponentDecomposition const& other) : Decomposition(other) { @@ -47,7 +66,7 @@ namespace storm { } template - void StronglyConnectedComponentDecomposition::performSccDecomposition(storm::models::AbstractModel const& model, storm::storage::BitVector 
const& subsystem, bool dropNaiveSccs, bool onlyBottomSccs) { + void StronglyConnectedComponentDecomposition::performSccDecomposition(storm::models::AbstractPseudoModel const& model, storm::storage::BitVector const& subsystem, bool dropNaiveSccs, bool onlyBottomSccs) { // Set up the environment of Tarjan's algorithm. uint_fast64_t numberOfStates = model.getNumberOfStates(); std::vector tarjanStack; @@ -68,7 +87,7 @@ namespace storm { template - void StronglyConnectedComponentDecomposition::performSccDecomposition(storm::models::AbstractModel const& model, bool dropNaiveSccs, bool onlyBottomSccs) { + void StronglyConnectedComponentDecomposition::performSccDecomposition(storm::models::AbstractPseudoModel const& model, bool dropNaiveSccs, bool onlyBottomSccs) { // Prepare a block that contains all states for a call to the other overload of this function. storm::storage::BitVector fullSystem(model.getNumberOfStates(), true); @@ -77,7 +96,7 @@ namespace storm { } template - void StronglyConnectedComponentDecomposition::performSccDecompositionHelper(storm::models::AbstractModel const& model, uint_fast64_t startState, storm::storage::BitVector const& subsystem, uint_fast64_t& currentIndex, std::vector& stateIndices, std::vector& lowlinks, std::vector& tarjanStack, storm::storage::BitVector& tarjanStackStates, storm::storage::BitVector& visitedStates, bool dropNaiveSccs, bool onlyBottomSccs) { + void StronglyConnectedComponentDecomposition::performSccDecompositionHelper(storm::models::AbstractPseudoModel const& model, uint_fast64_t startState, storm::storage::BitVector const& subsystem, uint_fast64_t& currentIndex, std::vector& stateIndices, std::vector& lowlinks, std::vector& tarjanStack, storm::storage::BitVector& tarjanStackStates, storm::storage::BitVector& visitedStates, bool dropNaiveSccs, bool onlyBottomSccs) { // Create the stacks needed for turning the recursive formulation of Tarjan's algorithm into an iterative // version. In particular, we keep one stack for states and one stack for the iterators. The last one is not // strictly needed, but reduces iteration work when all successors of a particular state are considered. @@ -201,7 +220,7 @@ namespace storm { } } } - + // Explicitly instantiate the SCC decomposition. template class StronglyConnectedComponentDecomposition; } // namespace storage diff --git a/src/storage/StronglyConnectedComponentDecomposition.h b/src/storage/StronglyConnectedComponentDecomposition.h index a8ce0403c..091e3f0ff 100644 --- a/src/storage/StronglyConnectedComponentDecomposition.h +++ b/src/storage/StronglyConnectedComponentDecomposition.h @@ -3,11 +3,15 @@ #include "src/storage/Decomposition.h" #include "src/storage/BitVector.h" +#include "src/storage/SparseMatrix.h" namespace storm { namespace models { // Forward declare the abstract model class. template class AbstractModel; + + // Forward declare the abstract pseudo-model class. + template class AbstractPseudoModel; } namespace storage { @@ -33,6 +37,17 @@ namespace storm { * leaving the SCC), are kept. */ StronglyConnectedComponentDecomposition(storm::models::AbstractModel const& model, bool dropNaiveSccs = false, bool onlyBottomSccs = false); + + /* + * Creates an SCC decomposition of the given encapsulated model. + * + * @param pseudoModel The encapsulated model to decompose into SCCs. + * @param dropNaiveSccs A flag that indicates whether trivial SCCs (i.e. SCCs consisting of just one state + * without a self-loop) are to be kept in the decomposition. 
+ * @param onlyBottomSccs If set to true, only bottom SCCs, i.e. SCCs in which all states have no way of + * leaving the SCC), are kept. + */ + StronglyConnectedComponentDecomposition(storm::models::AbstractPseudoModel const& pseudoModel, bool dropNaiveSccs = false, bool onlyBottomSccs = false); /* * Creates an SCC decomposition of the given block in the given model. @@ -45,6 +60,18 @@ namespace storm { * leaving the SCC), are kept. */ StronglyConnectedComponentDecomposition(storm::models::AbstractModel const& model, StateBlock const& block, bool dropNaiveSccs = false, bool onlyBottomSccs = false); + + /* + * Creates an SCC decomposition of the given block in the given encapsulated model. + * + * @param pseudoModel The encapsulated model whose block to decompose. + * @param block The block to decompose into SCCs. + * @param dropNaiveSccs A flag that indicates whether trivial SCCs (i.e. SCCs consisting of just one state + * without a self-loop) are to be kept in the decomposition. + * @param onlyBottomSccs If set to true, only bottom SCCs, i.e. SCCs in which all states have no way of + * leaving the SCC), are kept. + */ + StronglyConnectedComponentDecomposition(storm::models::AbstractPseudoModel const& pseudoModel, StateBlock const& block, bool dropNaiveSccs = false, bool onlyBottomSccs = false); /* * Creates an SCC decomposition of the given subsystem in the given model. @@ -57,36 +84,50 @@ namespace storm { * leaving the SCC), are kept. */ StronglyConnectedComponentDecomposition(storm::models::AbstractModel const& model, storm::storage::BitVector const& subsystem, bool dropNaiveSccs = false, bool onlyBottomSccs = false); + + /* + * Creates an SCC decomposition of the given subsystem in the given encapsulated model. + * + * @param pseudoModel The encapsulated model that contains the block. + * @param subsystem A bit vector indicating which subsystem to consider for the decomposition into SCCs. + * @param dropNaiveSccs A flag that indicates whether trivial SCCs (i.e. SCCs consisting of just one state + * without a self-loop) are to be kept in the decomposition. + * @param onlyBottomSccs If set to true, only bottom SCCs, i.e. SCCs in which all states have no way of + * leaving the SCC), are kept. + */ + StronglyConnectedComponentDecomposition(storm::models::AbstractPseudoModel const& pseudoModel, storm::storage::BitVector const& subsystem, bool dropNaiveSccs = false, bool onlyBottomSccs = false); /*! * Creates an SCC decomposition by copying the given SCC decomposition. * - * @oaram other The SCC decomposition to copy. + * @param other The SCC decomposition to copy. */ StronglyConnectedComponentDecomposition(StronglyConnectedComponentDecomposition const& other); /*! * Assigns the contents of the given SCC decomposition to the current one by copying its contents. * - * @oaram other The SCC decomposition from which to copy-assign. + * @param other The SCC decomposition from which to copy-assign. */ StronglyConnectedComponentDecomposition& operator=(StronglyConnectedComponentDecomposition const& other); /*! * Creates an SCC decomposition by moving the given SCC decomposition. * - * @oaram other The SCC decomposition to move. + * @param other The SCC decomposition to move. */ StronglyConnectedComponentDecomposition(StronglyConnectedComponentDecomposition&& other); /*! * Assigns the contents of the given SCC decomposition to the current one by moving its contents. * - * @oaram other The SCC decomposition from which to copy-assign. + * @param other The SCC decomposition from which to copy-assign. 
*/ StronglyConnectedComponentDecomposition& operator=(StronglyConnectedComponentDecomposition&& other); private: + + /*! * Performs the SCC decomposition of the given model. As a side-effect this fills the vector of * blocks of the decomposition. @@ -97,7 +138,7 @@ namespace storm { * @param onlyBottomSccs If set to true, only bottom SCCs, i.e. SCCs in which all states have no way of * leaving the SCC), are kept. */ - void performSccDecomposition(storm::models::AbstractModel const& model, bool dropNaiveSccs, bool onlyBottomSccs); + void performSccDecomposition(storm::models::AbstractPseudoModel const& model, bool dropNaiveSccs, bool onlyBottomSccs); /* * Performs the SCC decomposition of the given block in the given model. As a side-effect this fills @@ -110,7 +151,7 @@ namespace storm { * @param onlyBottomSccs If set to true, only bottom SCCs, i.e. SCCs in which all states have no way of * leaving the SCC), are kept. */ - void performSccDecomposition(storm::models::AbstractModel const& model, storm::storage::BitVector const& subsystem, bool dropNaiveSccs, bool onlyBottomSccs); + void performSccDecomposition(storm::models::AbstractPseudoModel const& model, storm::storage::BitVector const& subsystem, bool dropNaiveSccs, bool onlyBottomSccs); /*! * A helper function that performs the SCC decomposition given all auxiliary data structures. As a @@ -130,7 +171,7 @@ namespace storm { * @param onlyBottomSccs If set to true, only bottom SCCs, i.e. SCCs in which all states have no way of * leaving the SCC), are kept. */ - void performSccDecompositionHelper(storm::models::AbstractModel const& model, uint_fast64_t startState, storm::storage::BitVector const& subsystem, uint_fast64_t& currentIndex, std::vector& stateIndices, std::vector& lowlinks, std::vector& tarjanStack, storm::storage::BitVector& tarjanStackStates, storm::storage::BitVector& visitedStates, bool dropNaiveSccs, bool onlyBottomSccs); + void performSccDecompositionHelper(storm::models::AbstractPseudoModel const& model, uint_fast64_t startState, storm::storage::BitVector const& subsystem, uint_fast64_t& currentIndex, std::vector& stateIndices, std::vector& lowlinks, std::vector& tarjanStack, storm::storage::BitVector& tarjanStackStates, storm::storage::BitVector& visitedStates, bool dropNaiveSccs, bool onlyBottomSccs); }; } } From 4eef3b0d578d46046ec58f5da55221ff38e4345f Mon Sep 17 00:00:00 2001 From: PBerger Date: Wed, 12 Feb 2014 02:33:27 +0100 Subject: [PATCH 03/43] Added an example for SCC related testing which will change soon Removed unnecessary code from the TopologicalValueIterationMdpPrctlModelChecker.h Fixed Bugs in graph.h (changes from Sparse Matrix Iterator, it didnt even compile anymore! Unused Code HAUNTS us) Former-commit-id: 96669adec97dfe1110bf967e32a12f7d9aab19c2 --- examples/mdp/scc/scc.pctl | 2 + ...ogicalValueIterationMdpPrctlModelChecker.h | 97 +------------------ ...onNondeterministicLinearEquationSolver.cpp | 22 +++-- src/utility/graph.h | 8 +- ...ValueIterationMdpPrctlModelCheckerTest.cpp | 9 +- 5 files changed, 28 insertions(+), 110 deletions(-) create mode 100644 examples/mdp/scc/scc.pctl diff --git a/examples/mdp/scc/scc.pctl b/examples/mdp/scc/scc.pctl new file mode 100644 index 000000000..393670a26 --- /dev/null +++ b/examples/mdp/scc/scc.pctl @@ -0,0 +1,2 @@ +Pmin=? [ F a ] +Pmax=? 
[ F a ] \ No newline at end of file diff --git a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h index 750d068c8..7093f4cb3 100644 --- a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h +++ b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h @@ -9,6 +9,7 @@ #define STORM_MODELCHECKER_PRCTL_TOPOLOGICALVALUEITERATIONSMDPPRCTLMODELCHECKER_H_ #include "src/modelchecker/prctl/SparseMdpPrctlModelChecker.h" +#include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h" #include "src/exceptions/InvalidPropertyException.h" #include @@ -29,7 +30,7 @@ public: * * @param model The MDP to be checked. */ - explicit TopologicalValueIterationMdpPrctlModelChecker(storm::models::Mdp const& model) : SparseMdpPrctlModelChecker(model) { + explicit TopologicalValueIterationMdpPrctlModelChecker(storm::models::Mdp const& model) : SparseMdpPrctlModelChecker(model, std::shared_ptr>(new storm::solver::TopologicalValueIterationNondeterministicLinearEquationSolver())) { // Intentionally left empty. } @@ -46,100 +47,6 @@ public: * Virtual destructor. Needs to be virtual, because this class has virtual methods. */ virtual ~TopologicalValueIterationMdpPrctlModelChecker() { } - -private: - /*! - * Solves the given equation system under the given parameters using the power method. - * - * @param A The matrix A specifying the coefficients of the equations. - * @param x The vector x for which to solve the equations. The initial value of the elements of - * this vector are used as the initial guess and might thus influence performance and convergence. - * @param b The vector b specifying the values on the right-hand-sides of the equations. - * @return The solution of the system of linear equations in form of the elements of the vector - * x. - */ - void solveEquationSystem(storm::storage::SparseMatrix const& matrix, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) const { - // Get the settings object to customize solving. - storm::settings::Settings* s = storm::settings::Settings::getInstance(); - - // Get relevant user-defined settings for solving the equations. - double precision = s->getOptionByLongName("precision").getArgument(0).getValueAsDouble(); - uint_fast64_t maxIterations = s->getOptionByLongName("maxIterations").getArgument(0).getValueAsUnsignedInteger(); - bool relative = s->getOptionByLongName("relative").getArgument(0).getValueAsBoolean(); - - // Now, we need to determine the SCCs of the MDP and a topological sort. - std::vector> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(matrix, nondeterministicChoiceIndices, stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph); - storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents); - std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); - - // Set up the environment for the power method. - std::vector multiplyResult(matrix.getRowCount()); - std::vector* currentX = &x; - std::vector* newX = new std::vector(x.size()); - std::vector* swap = nullptr; - uint_fast64_t currentMaxLocalIterations = 0; - uint_fast64_t localIterations = 0; - uint_fast64_t globalIterations = 0; - bool converged = true; - - // Iterate over all SCCs of the MDP as specified by the topological sort. 
This guarantees that an SCC is only - // solved after all SCCs it depends on have been solved. - for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) { - std::vector const& scc = stronglyConnectedComponents[*sccIndexIt]; - - // For the current SCC, we need to perform value iteration until convergence. - localIterations = 0; - converged = false; - while (!converged && localIterations < maxIterations) { - // Compute x' = A*x + b. - matrix.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); - storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); - - // Reduce the vector x' by applying min/max for all non-deterministic choices. - if (this->minimumOperatorStack.top()) { - storm::utility::reduceVectorMin(multiplyResult, newX, scc, nondeterministicChoiceIndices); - } else { - storm::utility::reduceVectorMax(multiplyResult, newX, scc, nondeterministicChoiceIndices); - } - - // Determine whether the method converged. - // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher - // running time. In fact, it is faster. This has to be investigated. - // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::equalModuloPrecision(*currentX, *newX, precision, relative); - - // Update environment variables. - swap = currentX; - currentX = newX; - newX = swap; - ++localIterations; - ++globalIterations; - } - - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; - } - } - - // If we performed an odd number of global iterations, we need to swap the x and currentX, because the newest - // result is currently stored in currentX, but x is the output vector. - // TODO: Check whether this is correct or should be put into the for-loop over SCCs. - if (globalIterations % 2 == 1) { - std::swap(x, *currentX); - delete currentX; - } else { - delete newX; - } - - // Check if the solver converged and issue a warning otherwise. - if (converged) { - LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); - } else { - LOG4CPLUS_WARN(logger, "Iterative solver did not converge."); - } - } }; } // namespace prctl diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index e075e2ee6..6ff0310ba 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -66,16 +66,24 @@ namespace storm { // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only // solved after all SCCs it depends on have been solved. + int counter = 0; + std::cout << "Solving Equation System using the TopologicalValueIterationNon..." 
<< std::endl; for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) { - std::vector const& scc = sccDecomposition[*sccIndexIt]; + storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; + + std::cout << "SCC " << counter << " contains:" << std::endl; + for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { + std::cout << *sccIt << ", "; + } + std::cout << std::endl; // For the current SCC, we need to perform value iteration until convergence. localIterations = 0; converged = false; while (!converged && localIterations < maximalNumberOfIterations) { // Compute x' = A*x + b. - A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); - storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); + //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); /* Versus: @@ -85,17 +93,17 @@ namespace storm { // Reduce the vector x' by applying min/max for all non-deterministic choices. if (minimize) { - storm::utility::reduceVectorMin(*multiplyResult, *newX, scc, nondeterministicChoiceIndices); + //storm::utility::reduceVectorMin(*multiplyResult, *newX, scc, nondeterministicChoiceIndices); } else { - storm::utility::reduceVectorMax(*multiplyResult, *newX, scc, nondeterministicChoiceIndices); + //storm::utility::reduceVectorMax(*multiplyResult, *newX, scc, nondeterministicChoiceIndices); } // Determine whether the method converged. // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher // running time. In fact, it is faster. This has to be investigated. // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::equalModuloPrecision(*currentX, *newX, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative); // Update environment variables. swap = currentX; @@ -120,7 +128,7 @@ namespace storm { } if (!xMemoryProvided) { - delete copyX; + delete newX; } if (!multiplyResultMemoryProvided) { diff --git a/src/utility/graph.h b/src/utility/graph.h index 8cb7c044e..9459014bd 100644 --- a/src/utility/graph.h +++ b/src/utility/graph.h @@ -569,7 +569,7 @@ namespace storm { LOG4CPLUS_ERROR(logger, "Provided matrix is required to be square."); throw storm::exceptions::InvalidArgumentException() << "Provided matrix is required to be square."; } - + uint_fast64_t numberOfStates = matrix.getRowCount(); // Prepare the result. This relies on the matrix being square. @@ -598,12 +598,12 @@ namespace storm { recursionStepBackward: for (; successorIterator != matrix.end(currentState); ++successorIterator) { - if (!visitedStates.get(successorIterator.first)) { + if (!visitedStates.get(successorIterator->first)) { // Put unvisited successor on top of our recursion stack and remember that. - recursionStack.push_back(successorIterator.first); + recursionStack.push_back(successorIterator->first); // Also, put initial value for iterator on corresponding recursion stack. 
- iteratorRecursionStack.push_back(matrix.begin(successorIterator.first)); + iteratorRecursionStack.push_back(matrix.begin(successorIterator->first)); goto recursionStepForward; } diff --git a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp index 7ccb05369..114de1b47 100644 --- a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp +++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -9,8 +9,9 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { storm::settings::Settings* s = storm::settings::Settings::getInstance(); - storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); - + //storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); + storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/scc/scc.tra", STORM_CPP_BASE_PATH "/examples/mdp/scc/scc.lab", ""); + ASSERT_EQ(parser.getType(), storm::models::MDP); std::shared_ptr> mdp = parser.getModel>(); @@ -152,7 +153,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { } TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { - storm::settings::Settings* s = storm::settings::Settings::getInstance(); + /*storm::settings::Settings* s = storm::settings::Settings::getInstance(); storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.trans.rew"); ASSERT_EQ(parser.getType(), storm::models::MDP); @@ -220,5 +221,5 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { result = mc.checkNoBoundOperator(*rewardFormula);; ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); - delete rewardFormula; + delete rewardFormula;*/ } From 3052b19c58bdfd9da7d7b51c50078079f63ec023 Mon Sep 17 00:00:00 2001 From: PBerger Date: Wed, 12 Feb 2014 02:56:42 +0100 Subject: [PATCH 04/43] Created a "real" scc example. Modified the TopologicalValueIterationMdpPrctlModelCheckerTest.cpp to show the crash when not using TBB. Former-commit-id: 98b47e957389bbaf164aaede53acc58542b0165c --- examples/mdp/scc/scc.pctl | 4 ++-- ...pologicalValueIterationMdpPrctlModelCheckerTest.cpp | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/mdp/scc/scc.pctl b/examples/mdp/scc/scc.pctl index 393670a26..8a853a969 100644 --- a/examples/mdp/scc/scc.pctl +++ b/examples/mdp/scc/scc.pctl @@ -1,2 +1,2 @@ -Pmin=? [ F a ] -Pmax=? [ F a ] \ No newline at end of file +Pmin=? [ F end ] +Pmax=? 
[ F end ] \ No newline at end of file diff --git a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp index 114de1b47..874170a4f 100644 --- a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp +++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -16,12 +16,12 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { std::shared_ptr> mdp = parser.getModel>(); - ASSERT_EQ(mdp->getNumberOfStates(), 169ull); - ASSERT_EQ(mdp->getNumberOfTransitions(), 436ull); + ASSERT_EQ(mdp->getNumberOfStates(), 11ull); + ASSERT_EQ(mdp->getNumberOfTransitions(), 17ull); storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); - storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("two"); + storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("end"); storm::property::prctl::Eventually* eventuallyFormula = new storm::property::prctl::Eventually(apFormula); storm::property::prctl::ProbabilisticNoBoundOperator* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, true); @@ -30,7 +30,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { ASSERT_LT(std::abs(result[0] - 0.0277777612209320068), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); delete probFormula; - + /* apFormula = new storm::property::prctl::Ap("two"); eventuallyFormula = new storm::property::prctl::Eventually(apFormula); probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, false); @@ -149,7 +149,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); - delete rewardFormula; + delete rewardFormula;*/ } TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { From 17d9df1ac7541dc40f2358719496275339e19d8c Mon Sep 17 00:00:00 2001 From: dehnert Date: Wed, 12 Feb 2014 15:26:49 +0100 Subject: [PATCH 05/43] Some fixes to make the branch compile with clang. Former-commit-id: f9127a23c9c2a6bbaf657d3edc4c1869884bc318 --- .../TopologicalValueIterationMdpPrctlModelChecker.h | 2 +- ...IterationNondeterministicLinearEquationSolver.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h index 7093f4cb3..7b9c58d52 100644 --- a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h +++ b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h @@ -39,7 +39,7 @@ public: * constructed model checker will have the model of the given model checker as its associated model. */ explicit TopologicalValueIterationMdpPrctlModelChecker(storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker const& modelchecker) - : SparseMdpPrctlModelChecker(modelchecker), minimumOperatorStack() { + : SparseMdpPrctlModelChecker(modelchecker) { // Intentionally left empty. 
} diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 6ff0310ba..1af63b729 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -17,13 +17,13 @@ namespace storm { storm::settings::Settings* settings = storm::settings::Settings::getInstance(); // Get appropriate settings. - maximalNumberOfIterations = settings->getOptionByLongName("maxiter").getArgument(0).getValueAsUnsignedInteger(); - precision = settings->getOptionByLongName("precision").getArgument(0).getValueAsDouble(); - relative = !settings->isSet("absolute"); + this->maximalNumberOfIterations = settings->getOptionByLongName("maxiter").getArgument(0).getValueAsUnsignedInteger(); + this->precision = settings->getOptionByLongName("precision").getArgument(0).getValueAsDouble(); + this->relative = !settings->isSet("absolute"); } template - TopologicalValueIterationNondeterministicLinearEquationSolver::TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative) : NativeNondeterministicLinearEquationSolver(precision, maximalNumberOfIterations, relative) { + TopologicalValueIterationNondeterministicLinearEquationSolver::TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative) : NativeNondeterministicLinearEquationSolver(precision, maximalNumberOfIterations, relative) { // Intentionally left empty. } @@ -80,7 +80,7 @@ namespace storm { // For the current SCC, we need to perform value iteration until convergence. localIterations = 0; converged = false; - while (!converged && localIterations < maximalNumberOfIterations) { + while (!converged && localIterations < this->maximalNumberOfIterations) { // Compute x' = A*x + b. //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); @@ -103,7 +103,7 @@ namespace storm { // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher // running time. In fact, it is faster. This has to be investigated. // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, this->precision, this->relative); // Update environment variables. swap = currentX; From 98b0bcf1872b378e689b2b035a1230953752e368 Mon Sep 17 00:00:00 2001 From: PBerger Date: Mon, 24 Feb 2014 04:43:07 +0100 Subject: [PATCH 07/43] Reimplemented the TopologicalValueIterationNondeterministicLinearEquationSolver with splitting into submatrices. Added a dtmc example for tests with the StronglyConnectedComponentDecomposition. 
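For orientation, here is a minimal, self-contained sketch of the per-SCC strategy this commit works towards. It is an illustration only, not storm code: the Entry/Matrix types, the solveSccWise name, and the purely deterministic setting are simplifying assumptions, and the real solver additionally reduces nondeterministic row groups with min/max.

#include <cmath>
#include <cstddef>
#include <unordered_map>
#include <vector>

struct Entry { std::size_t column; double value; };
using Matrix = std::vector<std::vector<Entry>>; // one row of transitions per state

// Solves x = P*x + b by processing SCCs in topological order (dependencies first).
void solveSccWise(Matrix const& P, std::vector<double>& x, std::vector<double> const& b,
                  std::vector<std::vector<std::size_t>> const& topologicalSccs,
                  double precision = 1e-6, std::size_t maxIterations = 10000) {
    for (auto const& scc : topologicalSccs) {
        // Map global state indices to positions inside this SCC.
        std::unordered_map<std::size_t, std::size_t> localIndex;
        for (std::size_t i = 0; i < scc.size(); ++i) { localIndex[scc[i]] = i; }

        // Transitions leaving the SCC point to states that are already solved,
        // so their contribution P(s, s') * x(s') is a constant folded into b.
        std::vector<double> localB(scc.size());
        for (std::size_t i = 0; i < scc.size(); ++i) {
            localB[i] = b[scc[i]];
            for (Entry const& e : P[scc[i]]) {
                if (localIndex.count(e.column) == 0) { localB[i] += e.value * x[e.column]; }
            }
        }

        // Plain value iteration restricted to the SCC's submatrix.
        std::vector<double> current(scc.size()), next(scc.size());
        for (std::size_t i = 0; i < scc.size(); ++i) { current[i] = x[scc[i]]; }
        bool converged = false;
        for (std::size_t iter = 0; iter < maxIterations && !converged; ++iter) {
            converged = true;
            for (std::size_t i = 0; i < scc.size(); ++i) {
                double sum = localB[i];
                for (Entry const& e : P[scc[i]]) {
                    auto it = localIndex.find(e.column);
                    if (it != localIndex.end()) { sum += e.value * current[it->second]; }
                }
                if (std::abs(sum - current[i]) > precision) { converged = false; }
                next[i] = sum;
            }
            current.swap(next);
        }

        // Write the SCC's fixed point back into the global solution vector.
        for (std::size_t i = 0; i < scc.size(); ++i) { x[scc[i]] = current[i]; }
    }
}

Because cross-SCC values never change once their component has been processed, every subsystem is solved exactly once; that is the property the topological sort buys.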
Former-commit-id: 0c33793fe6ca844ac775cd29d050846626526a4d --- examples/mdp/scc/scc.pctl | 4 +- ...onNondeterministicLinearEquationSolver.cpp | 124 ++++++++++++++---- ...tronglyConnectedComponentDecomposition.cpp | 2 +- src/storm.cpp | 18 ++- src/utility/OsDetection.h | 5 + ...ValueIterationMdpPrctlModelCheckerTest.cpp | 32 ++++- ...glyConnectedComponentDecompositionTest.cpp | 28 ++++ 7 files changed, 179 insertions(+), 34 deletions(-) diff --git a/examples/mdp/scc/scc.pctl b/examples/mdp/scc/scc.pctl index 8a853a969..501655389 100644 --- a/examples/mdp/scc/scc.pctl +++ b/examples/mdp/scc/scc.pctl @@ -1,2 +1,2 @@ -Pmin=? [ F end ] -Pmax=? [ F end ] \ No newline at end of file +Pmin=? [ (!statetwo) U end ] +Pmax=? [ (!statetwo) U end ] \ No newline at end of file diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 1af63b729..c82a40af5 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -7,6 +7,11 @@ #include "src/utility/graph.h" #include "src/models/PseudoModel.h" #include "src/storage/StronglyConnectedComponentDecomposition.h" +#include "src/exceptions/IllegalArgumentException.h" + +#include "log4cplus/logger.h" +#include "log4cplus/loggingmacros.h" +extern log4cplus::Logger logger; namespace storm { namespace solver { @@ -39,11 +44,29 @@ namespace storm { //std::vector> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph); //storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents); + std::cout << "TopoSolver Input Matrix: " << A.getRowCount() << " x " << A.getColumnCount() << " with " << A.getEntryCount() << " Entries:" << std::endl; + + uint_fast64_t const rowCount = A.getRowCount(); + for (uint_fast64_t row = 0; row < rowCount; ++row) { + std::cout << "Row " << row << ": "; + auto const& rowElement = A.getRow(row); + for (auto rowIt = rowElement.begin(); rowIt != rowElement.end(); ++rowIt) { + std::cout << rowIt->first << " [" << rowIt->second << "], "; + } + std::cout << std::endl; + } + + storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); + //storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(*static_cast*>(&pseudoModel), false, false); + storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); + + if (sccDecomposition.size() == 0) { + LOG4CPLUS_ERROR(logger, "Cannot solve the given equation system, as the SCC decomposition returned no SCCs."); + throw storm::exceptions::IllegalArgumentException() << "Cannot solve the given equation system, as the SCC decomposition returned no SCCs."; + } + storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); // Set up the environment for the power 
method. @@ -52,12 +75,12 @@ namespace storm { multiplyResult = new std::vector(A.getRowCount()); multiplyResultMemoryProvided = false; } - std::vector* currentX = &x; - bool xMemoryProvided = true; - if (newX == nullptr) { - newX = new std::vector(x.size()); - xMemoryProvided = false; - } + std::vector* currentX = nullptr; + //bool xMemoryProvided = true; + //if (newX == nullptr) { + // newX = new std::vector(x.size()); + // xMemoryProvided = false; + //} std::vector* swap = nullptr; uint_fast64_t currentMaxLocalIterations = 0; uint_fast64_t localIterations = 0; @@ -68,20 +91,62 @@ namespace storm { // solved after all SCCs it depends on have been solved. int counter = 0; std::cout << "Solving Equation System using the TopologicalValueIterationNon..." << std::endl; + std::cout << "Found " << sccDecomposition.size() << " SCCs." << std::endl; + for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) { storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; - std::cout << "SCC " << counter << " contains:" << std::endl; - for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { - std::cout << *sccIt << ", "; + std::cout << "SCC " << counter << " from Index " << *sccIndexIt << " contains:" << std::endl; + ++counter; + for (auto state : scc) { + std::cout << state << ", "; } std::cout << std::endl; + // Generate a submatrix + storm::storage::BitVector subMatrixIndices(rowCount, scc.cbegin(), scc.cend()); + storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(subMatrixIndices, nondeterministicChoiceIndices, true); + std::vector sccSubB(sccSubmatrix.getRowCount()); + storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); + std::vector sccSubX(sccSubmatrix.getColumnCount()); + std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); + + // Prepare the pointers for swapping in the calculation + currentX = &sccSubX; + swap = &sccSubXSwap; + + storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, whereas b and multiplyResult are getRows() (nondet. choices times states) + std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); + sccSubNondeterministicChoiceIndices.at(0) = 0; + + // Preprocess all dependent states + // Remove outgoing transitions and create the ChoiceIndices + uint_fast64_t innerIndex = 0; + for (uint_fast64_t state: scc) { + // Choice Indices + sccSubNondeterministicChoiceIndices.at(innerIndex + 1) = sccSubNondeterministicChoiceIndices.at(innerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); + + for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { + storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); + for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { + if (!subMatrixIndices.get(rowIt->first)) { + // This is an outgoing transition of a state in the SCC to a state not included in the SCC + // Subtracting Pr(tau) * x_other from b fixes that + sccSubB.at(innerIndex) = sccSubB.at(innerIndex) - (rowIt->second * x.at(rowIt->first)); + } + } + } + ++innerIndex; + } + // For the current SCC, we need to perform value iteration until convergence. localIterations = 0; converged = false; while (!converged && localIterations < this->maximalNumberOfIterations) { // Compute x' = A*x + b. 
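+ // One sweep evaluates x' = A_scc * x + b_scc on the extracted submatrix; sccSubB
+ // already carries both the b entries of the SCC's rows and the folded-in
+ // contributions of transitions that leave the SCC (computed in the loop above).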
+ sccSubmatrix.multiplyWithVector(*currentX, *multiplyResult); + storm::utility::vector::addVectorsInPlace(*multiplyResult, sccSubB); + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); /* Versus: @@ -93,43 +158,46 @@ namespace storm { // Reduce the vector x' by applying min/max for all non-deterministic choices. if (minimize) { - //storm::utility::reduceVectorMin(*multiplyResult, *newX, scc, nondeterministicChoiceIndices); + storm::utility::vector::reduceVectorMin(*multiplyResult, *swap, sccSubNondeterministicChoiceIndices); } else { - //storm::utility::reduceVectorMax(*multiplyResult, *newX, scc, nondeterministicChoiceIndices); + storm::utility::vector::reduceVectorMax(*multiplyResult, *swap, sccSubNondeterministicChoiceIndices); } // Determine whether the method converged. // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher // running time. In fact, it is faster. This has to be investigated. // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, this->precision, this->relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); // Update environment variables. - swap = currentX; - currentX = newX; - newX = swap; + std::swap(currentX, swap); + ++localIterations; ++globalIterations; } + // The result of this SCC has to be written back into the main result vector + innerIndex = 0; + for (uint_fast64_t state: scc) { + x.at(state) = currentX->at(innerIndex); + ++innerIndex; + } + + // Since the pointers for swapping in the calculation point to temporaries, they are no longer valid here + currentX = nullptr; + swap = nullptr; + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep // track of the maximum. if (localIterations > currentMaxLocalIterations) { currentMaxLocalIterations = localIterations; } } - - // If we performed an odd number of global iterations, we need to swap the x and currentX, because the newest - // result is currently stored in currentX, but x is the output vector. - // TODO: Check whether this is correct or should be put into the for-loop over SCCs. 
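// (The odd/even swap removed below became unnecessary: the per-SCC loop above now writes every result straight back into x, so there is no pending buffer left to reconcile once all SCCs are processed.)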
- if (globalIterations % 2 == 1) { - std::swap(x, *currentX); - } - if (!xMemoryProvided) { - delete newX; - } + //if (!xMemoryProvided) { + // delete newX; + //} if (!multiplyResultMemoryProvided) { delete multiplyResult; diff --git a/src/storage/StronglyConnectedComponentDecomposition.cpp b/src/storage/StronglyConnectedComponentDecomposition.cpp index b46a5706f..b1100f369 100644 --- a/src/storage/StronglyConnectedComponentDecomposition.cpp +++ b/src/storage/StronglyConnectedComponentDecomposition.cpp @@ -35,7 +35,7 @@ namespace storm { template StronglyConnectedComponentDecomposition::StronglyConnectedComponentDecomposition(storm::models::AbstractModel const& model, storm::storage::BitVector const& subsystem, bool dropNaiveSccs, bool onlyBottomSccs) { storm::models::ModelBasedPseudoModel encapsulatedModel(model); - performSccDecomposition(*static_cast*>(&encapsulatedModel), subsystem, dropNaiveSccs, onlyBottomSccs); + performSccDecomposition(encapsulatedModel, subsystem, dropNaiveSccs, onlyBottomSccs); } template diff --git a/src/storm.cpp b/src/storm.cpp index 9149b2c99..fadcb3db1 100644 --- a/src/storm.cpp +++ b/src/storm.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -29,6 +30,7 @@ #include "src/models/AtomicPropositionsLabeling.h" #include "src/modelchecker/prctl/SparseDtmcPrctlModelChecker.h" #include "src/modelchecker/prctl/SparseMdpPrctlModelChecker.h" +#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h" #include "src/solver/GmmxxLinearEquationSolver.h" #include "src/solver/GmmxxNondeterministicLinearEquationSolver.h" #include "src/solver/GurobiLpSolver.h" @@ -132,6 +134,16 @@ void setUpFileLogging() { logger.addAppender(fileLogAppender); } +/*! +* Gives the current working directory +* +* @return std::string The path of the current working directory +*/ +std::string getCurrentWorkingDirectory() { + char temp[512]; + return (getcwd(temp, 512 - 1) ? std::string(temp) : std::string("")); +} + /*! * Prints the header. */ @@ -146,7 +158,8 @@ void printHeader(const int argc, const char* argv[]) { for (int i = 0; i < argc; ++i) { commandStream << argv[i] << " "; } - std::cout << "Command line: " << commandStream.str() << std::endl << std::endl; + std::cout << "Command line: " << commandStream.str() << std::endl; + std::cout << "Current working directory: " << getCurrentWorkingDirectory() << std::endl << std::endl; } /*! @@ -234,7 +247,8 @@ storm::modelchecker::prctl::AbstractModelChecker* createPrctlModelChecke */ storm::modelchecker::prctl::AbstractModelChecker* createPrctlModelChecker(storm::models::Mdp& mdp) { // Create the appropriate model checker. - return new storm::modelchecker::prctl::SparseMdpPrctlModelChecker(mdp); + //return new storm::modelchecker::prctl::SparseMdpPrctlModelChecker(mdp); + return new storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker(mdp); } /*! 
diff --git a/src/utility/OsDetection.h b/src/utility/OsDetection.h index 1969a6de7..37be3f0e7 100644 --- a/src/utility/OsDetection.h +++ b/src/utility/OsDetection.h @@ -4,10 +4,12 @@ #if defined __linux__ || defined __linux # define LINUX # include +# include #include // Required by ErrorHandling.h #include // Required by ErrorHandling.h #include // Required by storm.cpp, Memory Usage #include // Required by storm.cpp, Memory Usage +# define GetCurrentDir getcwd #elif defined TARGET_OS_MAC || defined __apple__ || defined __APPLE__ # define MACOSX # define _DARWIN_USE_64_BIT_INODE @@ -17,6 +19,7 @@ # include // Required by ErrorHandling.h # include // Required by storm.cpp, Memory Usage # include // Required by storm.cpp, Memory Usage +# define GetCurrentDir getcwd #elif defined _WIN32 || defined _WIN64 # define WINDOWS # ifndef NOMINMAX @@ -28,8 +31,10 @@ # include # include # include +# include # define strncpy strncpy_s # define sscanf sscanf_s +# define GetCurrentDir _getcwd // This disables Warning C4250 - Diamond Inheritance Dominance #pragma warning(disable:4250) diff --git a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp index 874170a4f..c2956f779 100644 --- a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp +++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -7,6 +7,36 @@ #include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h" #include "src/parser/AutoParser.h" +TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, SmallLinEqSystem) { + storm::storage::SparseMatrixBuilder matrixBuilder(3, 3); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 2, 1.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 4.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 7.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, -1.0)); + + storm::storage::SparseMatrix matrix; + ASSERT_NO_THROW(matrix = matrixBuilder.build()); + + ASSERT_EQ(3, matrix.getRowCount()); + ASSERT_EQ(3, matrix.getColumnCount()); + ASSERT_EQ(5, matrix.getEntryCount()); + + // Solve the Linear Equation System + storm::solver::TopologicalValueIterationNondeterministicLinearEquationSolver topoSolver; + + std::vector x(3); + std::vector b = { 5, 8, 2 }; + std::vector choices = { 0, 1, 2, 3 }; + + ASSERT_NO_THROW(topoSolver.solveEquationSystem(true, matrix, x, b, choices)); + + storm::settings::Settings* s = storm::settings::Settings::getInstance(); + ASSERT_LT(std::abs(x.at(0) - 0.25), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(x.at(1) - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(x.at(2) - 5.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +} + TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { storm::settings::Settings* s = storm::settings::Settings::getInstance(); //storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); @@ -17,7 +47,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { std::shared_ptr> mdp = parser.getModel>(); ASSERT_EQ(mdp->getNumberOfStates(), 11ull); - ASSERT_EQ(mdp->getNumberOfTransitions(), 17ull); 
+ ASSERT_EQ(mdp->getNumberOfTransitions(), 18ull); storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); diff --git a/test/functional/storage/StronglyConnectedComponentDecompositionTest.cpp b/test/functional/storage/StronglyConnectedComponentDecompositionTest.cpp index 22f04d0d1..0279b72fb 100644 --- a/test/functional/storage/StronglyConnectedComponentDecompositionTest.cpp +++ b/test/functional/storage/StronglyConnectedComponentDecompositionTest.cpp @@ -44,3 +44,31 @@ TEST(StronglyConnectedComponentDecomposition, FullSystem2) { markovAutomaton = nullptr; } + +TEST(StronglyConnectedComponentDecomposition, MatrixBasedSystem) { + storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/dtmc/scc/scc.tra", STORM_CPP_BASE_PATH "/examples/dtmc/scc/scc.lab", "", ""); + std::shared_ptr> dtmc = parser.getModel>(); + + storm::storage::StronglyConnectedComponentDecomposition sccDecomposition; + ASSERT_NO_THROW(sccDecomposition = storm::storage::StronglyConnectedComponentDecomposition(*dtmc, true, false)); + + ASSERT_EQ(sccDecomposition.size(), 3); + + // Now, because there is no guaranteed ordering, we have to check the contents of the SCCs in an order-insensitive way. + storm::storage::StateBlock const& scc1 = sccDecomposition[0]; + storm::storage::StateBlock const& scc2 = sccDecomposition[1]; + storm::storage::StateBlock const& scc3 = sccDecomposition[2]; + + std::vector correctScc1 = { 1, 2, 3, 4 }; + std::vector correctScc2 = { 5, 6, 7, 8 }; + std::vector correctScc3 = { 0 }; + + ASSERT_TRUE(scc1 == storm::storage::StateBlock(correctScc1.begin(), correctScc1.end()) || scc1 == storm::storage::StateBlock(correctScc2.begin(), correctScc2.end()) || scc1 == storm::storage::StateBlock(correctScc3.begin(), correctScc3.end())); + ASSERT_TRUE(scc2 == storm::storage::StateBlock(correctScc1.begin(), correctScc1.end()) || scc2 == storm::storage::StateBlock(correctScc2.begin(), correctScc2.end()) || scc2 == storm::storage::StateBlock(correctScc3.begin(), correctScc3.end())); + ASSERT_TRUE(scc3 == storm::storage::StateBlock(correctScc1.begin(), correctScc1.end()) || scc3 == storm::storage::StateBlock(correctScc2.begin(), correctScc2.end()) || scc3 == storm::storage::StateBlock(correctScc3.begin(), correctScc3.end())); + + ASSERT_NO_THROW(sccDecomposition = storm::storage::StronglyConnectedComponentDecomposition(*dtmc, true, true)); + ASSERT_EQ(2, sccDecomposition.size()); + + dtmc = nullptr; +} \ No newline at end of file From 5b1513e9e5bef42798e009b2052f9cc8f4ae6385 Mon Sep 17 00:00:00 2001 From: PBerger Date: Mon, 24 Feb 2014 04:43:36 +0100 Subject: [PATCH 08/43] Fixed issues with unused but named variables. Former-commit-id: 54548657798e2db7476d0d4855d62a8743d364e1 --- src/counterexamples/SMTMinimalCommandSetGenerator.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/counterexamples/SMTMinimalCommandSetGenerator.h b/src/counterexamples/SMTMinimalCommandSetGenerator.h index f95bc2806..6ccee589d 100644 --- a/src/counterexamples/SMTMinimalCommandSetGenerator.h +++ b/src/counterexamples/SMTMinimalCommandSetGenerator.h @@ -1801,14 +1801,14 @@ namespace storm { phiStates = untilFormula.getLeft().check(modelchecker); psiStates = untilFormula.getRight().check(modelchecker); - } catch (std::bad_cast const& e) { + } catch (std::bad_cast const&) { // If the nested formula was not an until formula, it remains to check whether it's an eventually formula. 
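// (An eventually formula F psi is handled as true U psi: phiStates is simply set to all states before psi is checked, as the try block below shows.)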
try { storm::property::prctl::Eventually const& eventuallyFormula = dynamic_cast const&>(pathFormula); phiStates = storm::storage::BitVector(labeledMdp.getNumberOfStates(), true); psiStates = eventuallyFormula.getChild().check(modelchecker); - } catch (std::bad_cast const& e) { + } catch (std::bad_cast const&) { // If the nested formula is neither an until nor a finally formula, we throw an exception. throw storm::exceptions::InvalidPropertyException() << "Formula nested inside probability bound operator must be an until or eventually formula for counterexample generation."; } From 19ca7bedaa687430b6aec3829a228e3d1df8f191 Mon Sep 17 00:00:00 2001 From: PBerger Date: Mon, 24 Feb 2014 04:44:49 +0100 Subject: [PATCH 09/43] Replaced C-style casts with explicit casts Former-commit-id: 083816623722d2d4d37f8e402168cf94580d1d88 --- src/counterexamples/PathBasedSubsystemGenerator.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/counterexamples/PathBasedSubsystemGenerator.h b/src/counterexamples/PathBasedSubsystemGenerator.h index 58678a390..b7a9424fa 100644 --- a/src/counterexamples/PathBasedSubsystemGenerator.h +++ b/src/counterexamples/PathBasedSubsystemGenerator.h @@ -567,7 +567,7 @@ public: allowedStates = storm::storage::BitVector(targetStates.size(), true); } else if(globally != nullptr){ - //eventually reaching a state without property visiting only states with property + // eventually reaching a state without property visiting only states with property allowedStates = globally->getChild().check(modelCheck); targetStates = storm::storage::BitVector(allowedStates); targetStates.complement(); @@ -588,9 +588,9 @@ public: // estimate the path count using the model's state count as well as the probability bound uint_fast8_t const minPrec = 10; uint_fast64_t const stateCount = model.getNumberOfStates(); - uint_fast64_t const stateEstimate = ((T) stateCount) * bound; + uint_fast64_t const stateEstimate = static_cast((static_cast(stateCount)) * bound); - //since this only has a good effect on big models -> use only if model has at least 10^5 states + // since this only has a good effect on big models -> use only if model has at least 10^5 states uint_fast64_t precision = stateEstimate > 100000 ? stateEstimate/1000 : minPrec; @@ -683,12 +683,12 @@ public: //std::cout << "Diff: " << diff << std::endl; //std::cout << "Path count: " << pathCount << std::endl; - //Are we critical? + // Are we critical? if(subSysProb >= bound){ break; } else if (stateEstimate > 100000){ - precision = (stateEstimate/1000) - ((stateEstimate/1000) - minPrec)*(subSysProb/bound); + precision = static_cast((stateEstimate / 1000.0) - ((stateEstimate / 1000.0) - minPrec) * (subSysProb/bound)); } } } From f049a9f0af8ced590af0906f69c4bbf8e22b3e5c Mon Sep 17 00:00:00 2001 From: dehnert Date: Mon, 24 Feb 2014 16:24:48 +0100 Subject: [PATCH 10/43] Bugfix for topological equation solver. 
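Restricted to an SCC C, each row of the system reads x(s) = sum_{s' in C} P(s, s') * x(s') + b(s) + sum_{s' not in C} P(s, s') * x(s'). Since all states outside C were solved earlier in the topological order, the last sum is a known constant and has to be added to the local right-hand side; the previous revision subtracted it. This commit flips that sign, sizes the submatrix index bit vector by the column count instead of the row count, and gives the SCC iteration its own sccMultiplyResult buffer.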
Former-commit-id: b8563f8b3e03e4141096d488494889a53f2ad139 --- ...onNondeterministicLinearEquationSolver.cpp | 53 ++++++++++++------- src/utility/ErrorHandling.h | 2 +- ...ValueIterationMdpPrctlModelCheckerTest.cpp | 43 ++++++++------- 3 files changed, 57 insertions(+), 41 deletions(-) diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index c82a40af5..2d1573f94 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -56,7 +56,10 @@ namespace storm { std::cout << std::endl; } - + std::cout << A << std::endl; + std::cout << nondeterministicChoiceIndices << std::endl; + std::cout << b << std::endl; + storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); //storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(*static_cast*>(&pseudoModel), false, false); storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); @@ -70,11 +73,11 @@ namespace storm { std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); // Set up the environment for the power method. - bool multiplyResultMemoryProvided = true; - if (multiplyResult == nullptr) { - multiplyResult = new std::vector(A.getRowCount()); - multiplyResultMemoryProvided = false; - } +// bool multiplyResultMemoryProvided = true; +// if (multiplyResult == nullptr) { +// multiplyResult = new std::vector(A.getRowCount()); +// multiplyResultMemoryProvided = false; +// } std::vector* currentX = nullptr; //bool xMemoryProvided = true; //if (newX == nullptr) { @@ -104,13 +107,14 @@ namespace storm { std::cout << std::endl; // Generate a submatrix - storm::storage::BitVector subMatrixIndices(rowCount, scc.cbegin(), scc.cend()); - storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(subMatrixIndices, nondeterministicChoiceIndices, true); + storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); + storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(subMatrixIndices, nondeterministicChoiceIndices); std::vector sccSubB(sccSubmatrix.getRowCount()); storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); std::vector sccSubX(sccSubmatrix.getColumnCount()); std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); - + std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); + // Prepare the pointers for swapping in the calculation currentX = &sccSubX; swap = &sccSubXSwap; @@ -119,33 +123,40 @@ namespace storm { std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); sccSubNondeterministicChoiceIndices.at(0) = 0; + std::cout << "subb: " << sccSubB << std::endl; // Preprocess all dependant states // Remove outgoing transitions and create the ChoiceIndices uint_fast64_t innerIndex = 0; + uint_fast64_t outerIndex = 0; for (uint_fast64_t state: scc) { // Choice Indices - sccSubNondeterministicChoiceIndices.at(innerIndex + 1) = sccSubNondeterministicChoiceIndices.at(innerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); + sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); 
for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { - storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); + typename storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { if (!subMatrixIndices.get(rowIt->first)) { // This is an outgoing transition of a state in the SCC to a state not included in the SCC // Subtracting Pr(tau) * x_other from b fixes that - sccSubB.at(innerIndex) = sccSubB.at(innerIndex) - (rowIt->second * x.at(rowIt->first)); + sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->second * x.at(rowIt->first)); } } + ++innerIndex; } - ++innerIndex; + ++outerIndex; } + + std::cout << sccSubmatrix << std::endl; + std::cout << sccSubNondeterministicChoiceIndices << std::endl; + std::cout << sccSubB << std::endl; // For the current SCC, we need to perform value iteration until convergence. localIterations = 0; converged = false; while (!converged && localIterations < this->maximalNumberOfIterations) { // Compute x' = A*x + b. - sccSubmatrix.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, sccSubB); + sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); + storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); @@ -158,10 +169,10 @@ namespace storm { // Reduce the vector x' by applying min/max for all non-deterministic choices. if (minimize) { - storm::utility::vector::reduceVectorMin(*multiplyResult, *swap, sccSubNondeterministicChoiceIndices); + storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); } else { - storm::utility::vector::reduceVectorMax(*multiplyResult, *swap, sccSubNondeterministicChoiceIndices); + storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); } // Determine whether the method converged. @@ -180,9 +191,11 @@ namespace storm { // The Result of this SCC has to be taken back into the main result vector innerIndex = 0; for (uint_fast64_t state: scc) { + std::cout << state << " = " << currentX->at(innerIndex) << std::endl; x.at(state) = currentX->at(innerIndex); ++innerIndex; } + std::cout << x << std::endl; // Since the pointers for swapping in the calculation point to temps they should not be valide anymore currentX = nullptr; @@ -199,9 +212,9 @@ namespace storm { // delete newX; //} - if (!multiplyResultMemoryProvided) { - delete multiplyResult; - } +// if (!multiplyResultMemoryProvided) { +// delete multiplyResult; +// } // Check if the solver converged and issue a warning otherwise. 
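// (Since the loop over SCCs exits as soon as one component fails to converge, a false value here means some SCC hit the iteration bound; on success, currentMaxLocalIterations is the cost of the most expensive component.)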
if (converged) { diff --git a/src/utility/ErrorHandling.h b/src/utility/ErrorHandling.h index 0872757b6..5b9b50bdc 100644 --- a/src/utility/ErrorHandling.h +++ b/src/utility/ErrorHandling.h @@ -164,7 +164,7 @@ VOID CALLBACK stormWindowsSetTimerCallBack( void stormSetAlarm(uint_fast64_t timeoutSeconds) { #ifndef WINDOWS - alarm(timeout); + alarm(timeoutSeconds); #else // This needs more research (http://msdn.microsoft.com/en-us/library/windows/desktop/ms644906(v=vs.85).aspx) UINT_PTR retVal = SetTimer(NULL, 0, static_cast(timeoutSeconds * 1000), static_cast(&stormWindowsSetTimerCallBack)); diff --git a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp index c2956f779..41f75fd46 100644 --- a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp +++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -8,50 +8,53 @@ #include "src/parser/AutoParser.h" TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, SmallLinEqSystem) { - storm::storage::SparseMatrixBuilder matrixBuilder(3, 3); - ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 2, 1.0)); - ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 4.0)); - ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0)); - ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 7.0)); - ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, -1.0)); + storm::storage::SparseMatrixBuilder matrixBuilder(4, 4); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 0.1)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 2, 0.9)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 1.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 1.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(3, 2, 0.8)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(3, 3, 0.2)); storm::storage::SparseMatrix matrix; ASSERT_NO_THROW(matrix = matrixBuilder.build()); - ASSERT_EQ(3, matrix.getRowCount()); - ASSERT_EQ(3, matrix.getColumnCount()); - ASSERT_EQ(5, matrix.getEntryCount()); + ASSERT_EQ(4, matrix.getRowCount()); + ASSERT_EQ(4, matrix.getColumnCount()); + ASSERT_EQ(6, matrix.getEntryCount()); // Solve the Linear Equation System storm::solver::TopologicalValueIterationNondeterministicLinearEquationSolver topoSolver; - std::vector x(3); - std::vector b = { 5, 8, 2 }; - std::vector choices = { 0, 1, 2, 3 }; + std::vector x(4); + std::vector b = { 1, 2, 3, 4 }; + std::vector choices = { 0, 1, 2, 3, 4 }; ASSERT_NO_THROW(topoSolver.solveEquationSystem(true, matrix, x, b, choices)); storm::settings::Settings* s = storm::settings::Settings::getInstance(); - ASSERT_LT(std::abs(x.at(0) - 0.25), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); - ASSERT_LT(std::abs(x.at(1) - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); - ASSERT_LT(std::abs(x.at(2) - 5.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(x.at(0) - 2.9), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(x.at(1) - 2), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(x.at(2) - 3), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(x.at(3) - 3.2), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); } TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { storm::settings::Settings* s = 
storm::settings::Settings::getInstance(); - //storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); - storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/scc/scc.tra", STORM_CPP_BASE_PATH "/examples/mdp/scc/scc.lab", ""); + storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); + //storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/scc/scc.tra", STORM_CPP_BASE_PATH "/examples/mdp/scc/scc.lab", ""); ASSERT_EQ(parser.getType(), storm::models::MDP); std::shared_ptr> mdp = parser.getModel>(); - ASSERT_EQ(mdp->getNumberOfStates(), 11ull); - ASSERT_EQ(mdp->getNumberOfTransitions(), 18ull); +// ASSERT_EQ(mdp->getNumberOfStates(), 11ull); +// ASSERT_EQ(mdp->getNumberOfTransitions(), 18ull); storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); - storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("end"); +// storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("end"); + storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("two"); storm::property::prctl::Eventually* eventuallyFormula = new storm::property::prctl::Eventually(apFormula); storm::property::prctl::ProbabilisticNoBoundOperator* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, true); From a4ae226e57ef7580974e9e6fb5b977ee025b1a45 Mon Sep 17 00:00:00 2001 From: PBerger Date: Mon, 24 Feb 2014 19:59:05 +0100 Subject: [PATCH 11/43] Removed debug output from our debugging session Former-commit-id: 43a0c63a6cd907e5c871ceb089d33435325d23bd --- ...IterationNondeterministicLinearEquationSolver.cpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 2d1573f94..0ecc48e2e 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -56,10 +56,7 @@ namespace storm { std::cout << std::endl; } - std::cout << A << std::endl; - std::cout << nondeterministicChoiceIndices << std::endl; - std::cout << b << std::endl; - + storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); //storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(*static_cast*>(&pseudoModel), false, false); storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); @@ -123,7 +120,6 @@ namespace storm { std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); sccSubNondeterministicChoiceIndices.at(0) = 0; - std::cout << "subb: " << sccSubB << std::endl; // Preprocess all dependant states // Remove outgoing transitions and create the ChoiceIndices uint_fast64_t innerIndex = 0; @@ -145,10 +141,6 @@ namespace storm { } ++outerIndex; } - - std::cout << sccSubmatrix << std::endl; - std::cout << sccSubNondeterministicChoiceIndices << std::endl; - std::cout << sccSubB << std::endl; // For the current SCC, we need to perform value iteration until convergence. 
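// (Convergence is tracked per SCC only: the components this one depends on are already at their fixed point, so each subsystem is solved exactly once.)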
localIterations = 0; @@ -191,11 +183,9 @@ namespace storm { // The Result of this SCC has to be taken back into the main result vector innerIndex = 0; for (uint_fast64_t state: scc) { - std::cout << state << " = " << currentX->at(innerIndex) << std::endl; x.at(state) = currentX->at(innerIndex); ++innerIndex; } - std::cout << x << std::endl; // Since the pointers for swapping in the calculation point to temps they should not be valide anymore currentX = nullptr; From af650b6666b30a2589429f2245a7349e04231254 Mon Sep 17 00:00:00 2001 From: PBerger Date: Tue, 25 Feb 2014 03:50:24 +0100 Subject: [PATCH 12/43] Removed debug outputs from the TopologicalValueIterationNondeterministicLinearEquationSolver Fixed the topo tests, since the comparison values are a bit off for this solver Former-commit-id: 56c763b37a53a7d662a7fc7d0f71430c0be53a76 --- ...onNondeterministicLinearEquationSolver.cpp | 22 ------- ...ValueIterationMdpPrctlModelCheckerTest.cpp | 61 +++++-------------- 2 files changed, 14 insertions(+), 69 deletions(-) diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 0ecc48e2e..f6c4473fa 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -44,19 +44,6 @@ namespace storm { //std::vector> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph); //storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents); - std::cout << "TopoSolver Input Matrix: " << A.getRowCount() << " x " << A.getColumnCount() << " with " << A.getEntryCount() << " Entries:" << std::endl; - - uint_fast64_t const rowCount = A.getRowCount(); - for (uint_fast64_t row = 0; row < rowCount; ++row) { - std::cout << "Row " << row << ": "; - auto const& rowElement = A.getRow(row); - for (auto rowIt = rowElement.begin(); rowIt != rowElement.end(); ++rowIt) { - std::cout << rowIt->first << " [" << rowIt->second << "], "; - } - std::cout << std::endl; - } - - storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); //storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(*static_cast*>(&pseudoModel), false, false); storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); @@ -90,19 +77,10 @@ namespace storm { // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only // solved after all SCCs it depends on have been solved. int counter = 0; - std::cout << "Solving Equation System using the TopologicalValueIterationNon..." << std::endl; - std::cout << "Found " << sccDecomposition.size() << " SCCs." 
<< std::endl; for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) { storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; - std::cout << "SCC " << counter << " from Index " << *sccIndexIt << " contains:" << std::endl; - ++counter; - for (auto state : scc) { - std::cout << state << ", "; - } - std::cout << std::endl; - // Generate a submatrix storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(subMatrixIndices, nondeterministicChoiceIndices); diff --git a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp index 41f75fd46..f39d17022 100644 --- a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp +++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -7,38 +7,6 @@ #include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h" #include "src/parser/AutoParser.h" -TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, SmallLinEqSystem) { - storm::storage::SparseMatrixBuilder matrixBuilder(4, 4); - ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 0.1)); - ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 2, 0.9)); - ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 1.0)); - ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 1.0)); - ASSERT_NO_THROW(matrixBuilder.addNextValue(3, 2, 0.8)); - ASSERT_NO_THROW(matrixBuilder.addNextValue(3, 3, 0.2)); - - storm::storage::SparseMatrix matrix; - ASSERT_NO_THROW(matrix = matrixBuilder.build()); - - ASSERT_EQ(4, matrix.getRowCount()); - ASSERT_EQ(4, matrix.getColumnCount()); - ASSERT_EQ(6, matrix.getEntryCount()); - - // Solve the Linear Equation System - storm::solver::TopologicalValueIterationNondeterministicLinearEquationSolver topoSolver; - - std::vector x(4); - std::vector b = { 1, 2, 3, 4 }; - std::vector choices = { 0, 1, 2, 3, 4 }; - - ASSERT_NO_THROW(topoSolver.solveEquationSystem(true, matrix, x, b, choices)); - - storm::settings::Settings* s = storm::settings::Settings::getInstance(); - ASSERT_LT(std::abs(x.at(0) - 2.9), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); - ASSERT_LT(std::abs(x.at(1) - 2), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); - ASSERT_LT(std::abs(x.at(2) - 3), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); - ASSERT_LT(std::abs(x.at(3) - 3.2), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); -} - TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { storm::settings::Settings* s = storm::settings::Settings::getInstance(); storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); @@ -48,12 +16,11 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { std::shared_ptr> mdp = parser.getModel>(); -// ASSERT_EQ(mdp->getNumberOfStates(), 11ull); -// ASSERT_EQ(mdp->getNumberOfTransitions(), 18ull); + ASSERT_EQ(mdp->getNumberOfStates(), 169ull); + ASSERT_EQ(mdp->getNumberOfTransitions(), 436ull); storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); -// storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("end"); storm::property::prctl::Ap* 
apFormula = new storm::property::prctl::Ap("two"); storm::property::prctl::Eventually* eventuallyFormula = new storm::property::prctl::Eventually(apFormula); storm::property::prctl::ProbabilisticNoBoundOperator* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, true); @@ -63,7 +30,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { ASSERT_LT(std::abs(result[0] - 0.0277777612209320068), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); delete probFormula; - /* + apFormula = new storm::property::prctl::Ap("two"); eventuallyFormula = new storm::property::prctl::Eventually(apFormula); probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, false); @@ -120,7 +87,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { result = mc.checkNoBoundOperator(*rewardFormula); - ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(result[0] - 7.33332904), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); delete rewardFormula; apFormula = new storm::property::prctl::Ap("done"); @@ -129,7 +96,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { result = mc.checkNoBoundOperator(*rewardFormula);; - ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); delete rewardFormula; storm::parser::AutoParser stateRewardParser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", ""); @@ -138,7 +105,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { std::shared_ptr> stateRewardMdp = stateRewardParser.getModel>(); - storm::modelchecker::prctl::SparseMdpPrctlModelChecker stateRewardModelChecker(*stateRewardMdp, std::shared_ptr>(new storm::solver::NativeNondeterministicLinearEquationSolver())); + storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker stateRewardModelChecker(*stateRewardMdp); apFormula = new storm::property::prctl::Ap("done"); reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); @@ -146,7 +113,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula); - ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(result[0] - 7.33332904), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); delete rewardFormula; apFormula = new storm::property::prctl::Ap("done"); @@ -155,7 +122,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula); - ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); delete rewardFormula; storm::parser::AutoParser stateAndTransitionRewardParser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", 
STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); @@ -164,7 +131,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { std::shared_ptr> stateAndTransitionRewardMdp = stateAndTransitionRewardParser.getModel>(); - storm::modelchecker::prctl::SparseMdpPrctlModelChecker stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp, std::shared_ptr>(new storm::solver::NativeNondeterministicLinearEquationSolver())); + storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp); apFormula = new storm::property::prctl::Ap("done"); reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); @@ -172,7 +139,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); - ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + ASSERT_LT(std::abs(result[0] - 14.6666581), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); delete rewardFormula; apFormula = new storm::property::prctl::Ap("done"); @@ -181,12 +148,12 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); - ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); - delete rewardFormula;*/ + ASSERT_LT(std::abs(result[0] - 14.666663), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; } TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { - /*storm::settings::Settings* s = storm::settings::Settings::getInstance(); + storm::settings::Settings* s = storm::settings::Settings::getInstance(); storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.trans.rew"); ASSERT_EQ(parser.getType(), storm::models::MDP); @@ -254,5 +221,5 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { result = mc.checkNoBoundOperator(*rewardFormula);; ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); - delete rewardFormula;*/ + delete rewardFormula; } From d5828043dee6af484057a80b66317d6949fdb9b8 Mon Sep 17 00:00:00 2001 From: PBerger Date: Wed, 26 Feb 2014 04:25:34 +0100 Subject: [PATCH 13/43] Added first signs of the CUDA Extension for Storm. 
Former-commit-id: b02385cd822d24e2a3ce1970909bfcbc4041f5dc --- resources/cudaForStorm/CMakeLists.txt | 334 ++++++++++++++++++ resources/cudaForStorm/src/cudaTests.h | 124 +++++++ resources/cudaForStorm/src/main.cpp | 62 ++++ .../cudaForStorm/srcCuda/allCudaKernels.h | 4 + resources/cudaForStorm/srcCuda/bandWidth.cu | 0 resources/cudaForStorm/srcCuda/bandWidth.h | 0 resources/cudaForStorm/srcCuda/basicAdd.cu | 286 +++++++++++++++ resources/cudaForStorm/srcCuda/basicAdd.h | 9 + .../cudaForStorm/srcCuda/kernelSwitchTest.cu | 39 ++ .../cudaForStorm/srcCuda/kernelSwitchTest.h | 1 + resources/cudaForStorm/srcCuda/utility.cu | 19 + resources/cudaForStorm/srcCuda/utility.h | 3 + 12 files changed, 881 insertions(+) create mode 100644 resources/cudaForStorm/CMakeLists.txt create mode 100644 resources/cudaForStorm/src/cudaTests.h create mode 100644 resources/cudaForStorm/src/main.cpp create mode 100644 resources/cudaForStorm/srcCuda/allCudaKernels.h create mode 100644 resources/cudaForStorm/srcCuda/bandWidth.cu create mode 100644 resources/cudaForStorm/srcCuda/bandWidth.h create mode 100644 resources/cudaForStorm/srcCuda/basicAdd.cu create mode 100644 resources/cudaForStorm/srcCuda/basicAdd.h create mode 100644 resources/cudaForStorm/srcCuda/kernelSwitchTest.cu create mode 100644 resources/cudaForStorm/srcCuda/kernelSwitchTest.h create mode 100644 resources/cudaForStorm/srcCuda/utility.cu create mode 100644 resources/cudaForStorm/srcCuda/utility.h diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt new file mode 100644 index 000000000..a3d1e7f4f --- /dev/null +++ b/resources/cudaForStorm/CMakeLists.txt @@ -0,0 +1,334 @@ +cmake_minimum_required (VERSION 2.8.6) + +# Set project name +project (cudaForStorm CXX C) + +# Set the version number +set (STORM_CPP_VERSION_MAJOR 1) +set (STORM_CPP_VERSION_MINOR 0) + +# Add base folder for better inclusion paths +include_directories("${PROJECT_SOURCE_DIR}") +include_directories("${PROJECT_SOURCE_DIR}/src") + +message(STATUS "CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}") + +############################################################# +## +## CMake options of StoRM +## +############################################################# +option(DEBUG "Sets whether the DEBUG mode is used" ON) +option(USE_POPCNT "Sets whether the popcnt instruction is going to be used." ON) +option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF) +option(USE_LIBCXX "Sets whether the standard library is libc++." OFF) +option(ENABLE_GLPK "Sets whether StoRM is built with support for glpk." 
OFF) +set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).") +set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).") +set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.") +set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.") + +############################################################# +## +## Inclusion of required libraries +## +############################################################# + +# Add the resources/cmake folder to Module Search Path for FindTBB.cmake +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/") + +find_package(CUDA REQUIRED) +find_package(Doxygen REQUIRED) +find_package(Threads REQUIRED) + +# If the DEBUG option was turned on, we will target a debug version and a release version otherwise +if (DEBUG) + set (CMAKE_BUILD_TYPE "DEBUG") +else() + set (CMAKE_BUILD_TYPE "RELEASE") +endif() +message(STATUS "StoRM - Building ${CMAKE_BUILD_TYPE} version.") + +if ("${GUROBI_ROOT}" STREQUAL "") + set(ENABLE_GUROBI OFF) +else() + set(ENABLE_GUROBI ON) +endif() + +if ("${Z3_ROOT}" STREQUAL "") + set(ENABLE_Z3 OFF) +else() + set(ENABLE_Z3 ON) + set(Z3_LIB_NAME "z3") +endif() + +message(STATUS "StoRM - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") +message(STATUS "StoRM - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}") + +############################################################# +## +## CUDA Options +## +############################################################# +SET (CUDA_VERBOSE_BUILD ON CACHE BOOL "nvcc verbose" FORCE) +set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON) +set(BUILD_SHARED_LIBS OFF) +set(CUDA_SEPARABLE_COMPILATION ON) +#set(CUDA_NVCC_FLAGS "-arch=sm_30") + +############################################################# +## +## Compiler specific settings and definitions +## +############################################################# + +# Path to the no-strict-aliasing target +set(CONVERSIONHELPER_TARGET "${PROJECT_SOURCE_DIR}/src/utility/ConversionHelper.cpp") + +if(CMAKE_COMPILER_IS_GNUCC) + message(STATUS "StoRM - Using Compiler Configuration: GCC") + # Set standard flags for GCC + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -pedantic") + # -Werror is atm removed as this gave some problems with existing code + # May be re-set later + # (Thomas Heinemann, 2012-12-21) + + # Turn on popcnt instruction if desired (yes by default) + if (USE_POPCNT) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt") + endif(USE_POPCNT) + + # Set the no-strict-aliasing target for GCC + set_source_files_properties(${CONVERSIONHELPER_TARGET} PROPERTIES COMPILE_FLAGS " -fno-strict-aliasing ") +elseif(MSVC) + message(STATUS "StoRM - Using Compiler Configuration: MSVC") + # required for GMM to compile, ugly error directive in their code + add_definitions(/D_SCL_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS) + # required as the PRCTL Parser bloats object files (COFF) beyond their maximum size (see http://msdn.microsoft.com/en-us/library/8578y171(v=vs.110).aspx) + add_definitions(/bigobj) + # required by GTest and PrismGrammar::createIntegerVariable + add_definitions(/D_VARIADIC_MAX=10) + # Windows.h breaks GMM in gmm_except.h because of its macro definition for min and max + add_definitions(/DNOMINMAX) + + if(ENABLE_Z3) + set(Z3_LIB_NAME "libz3") + endif() + + # MSVC does not do strict-aliasing, so no option needed +else(CLANG) + 
message(STATUS "StoRM - Using Compiler Configuration: Clang (LLVM)") + # As CLANG is not set as a variable, we need to set it in case we have not matched another compiler. + set (CLANG ON) + # Set standard flags for clang + set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops -O3") + if(UNIX AND NOT APPLE AND NOT USE_LIBCXX) + set(CLANG_STDLIB libstdc++) + message(STATUS "StoRM - Linking against libstdc++") + else() + set(CLANG_STDLIB libc++) + message(STATUS "StoRM - Linking against libc++") + # Disable Cotire + set(STORM_USE_COTIRE OFF) + # Set up some Xcode specific settings + set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") + endif() + + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=${CLANG_STDLIB} -Wall -pedantic -Wno-unused-variable -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -ftemplate-depth=1024") + + set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g") + + # Turn on popcnt instruction if desired (yes by default) + if (USE_POPCNT) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt") + endif(USE_POPCNT) + + # Set the no-strict-aliasing target for Clang + set_source_files_properties(${CONVERSIONHELPER_TARGET} PROPERTIES COMPILE_FLAGS " -fno-strict-aliasing ") +endif() + +############################################################# +## +## CMake-generated Config File for StoRM +## +############################################################# +# Base path for test files +set(STORM_CPP_TESTS_BASE_PATH "${PROJECT_SOURCE_DIR}/test") +# Gurobi Defines +if (ENABLE_GUROBI) + set(STORM_CPP_GUROBI_DEF "define") +else() + set(STORM_CPP_GUROBI_DEF "undef") +endif() + +# glpk defines +if (ENABLE_GLPK) + set(STORM_CPP_GLPK_DEF "define") +else() + set(STORM_CPP_GLPK_DEF "undef") +endif() + +# Z3 Defines +if (ENABLE_Z3) + set(STORM_CPP_Z3_DEF "define") +else() + set(STORM_CPP_Z3_DEF "undef") +endif() + +# Intel TBB Defines +if (TBB_FOUND AND ENABLE_INTELTBB) + set(STORM_CPP_INTELTBB_DEF "define") +else() + set(STORM_CPP_INTELTBB_DEF "undef") +endif() + +# Configure a header file to pass some of the CMake settings to the source code +configure_file ( + "${PROJECT_SOURCE_DIR}/../../storm-config.h.in" + "${PROJECT_BINARY_DIR}/include/storm-config.h" +) +# Add the binary dir include directory for storm-config.h +include_directories("${PROJECT_BINARY_DIR}/include") + +# Add the main source directory for includes +include_directories("${PROJECT_SOURCE_DIR}/../../src") + +############################################################# +## +## Source file aggregation and clustering +## +############################################################# +file(GLOB_RECURSE CUDAFORSTORM_HEADERS ${PROJECT_SOURCE_DIR}/src/*.h) +file(GLOB_RECURSE CUDAFORSTORM_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp) + +file(GLOB_RECURSE CUDAFORSTORM_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/srcCuda/*.cu") +file(GLOB_RECURSE CUDAFORSTORM_CUDA_HEADERS "${PROJECT_SOURCE_DIR}/srcCuda/*.h") + +# Additional include files like the storm-config.h +file(GLOB_RECURSE STORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h) + +# Group the headers and sources +source_group(main FILES ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) +source_group(cuda FILES ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}) + +# Add custom additional include or link directories +if (ADDITIONAL_INCLUDE_DIRS) + message(STATUS "StoRM - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}") + include_directories(${ADDITIONAL_INCLUDE_DIRS}) 
+endif(ADDITIONAL_INCLUDE_DIRS) +if (ADDITIONAL_LINK_DIRS) + message(STATUS "StoRM - Using additional link directories ${ADDITIONAL_LINK_DIRS}") + link_directories(${ADDITIONAL_LINK_DIRS}) +endif(ADDITIONAL_LINK_DIRS) + +############################################################# +## +## Pre executable-creation link_directories setup +## +############################################################# +if (ENABLE_GUROBI) + link_directories("${GUROBI_ROOT}/lib") +endif() +if (ENABLE_Z3) + link_directories("${Z3_ROOT}/bin") +endif() +if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL "")) + set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib") +endif () +link_directories(${Boost_LIBRARY_DIRS}) +if (TBB_FOUND AND ENABLE_INTELTBB) + link_directories(${TBB_LIBRARY_DIRS}) +endif() + +############################################################################### +## # +## Executable Creation # +## # +## All link_directories() calls MUST be made before this point # +## # +############################################################################### + +# Since this will be a library +include (GenerateExportHeader) + +add_library(cudaForStorm STATIC ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) +GENERATE_EXPORT_HEADER( cudaForStorm + BASE_NAME cudaForStorm + EXPORT_MACRO_NAME cudaForStorm_EXPORT + EXPORT_FILE_NAME cudaForStorm_Export.h + STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC +) + +############################################################# +## +## CUDA +## +############################################################# +#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30) +cuda_add_library(cudaLibrary + ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS} + OPTIONS -DSTUFF="" -arch=sm_30 + RELEASE -DNDEBUG + DEBUG -g -DDEBUG +) +target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY}) +ADD_DEPENDENCIES(cudaForStorm cudaLibrary) +target_link_libraries(cudaForStorm cudaLibrary) +message(STATUS "Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}") +include_directories(${CUDA_INCLUDE_DIRS}) + +############################################################# +## +## Gurobi (optional) +## +############################################################# +if (ENABLE_GUROBI) + message (STATUS "StoRM - Linking with Gurobi") + include_directories("${GUROBI_ROOT}/include") + target_link_libraries(cudaForStorm "gurobi56") +endif(ENABLE_GUROBI) + +############################################################# +## +## glpk (optional) +## +############################################################# +if (ENABLE_GLPK) + message (STATUS "StoRM - Linking with glpk") + target_link_libraries(cudaForStorm "glpk") +endif(ENABLE_GLPK) + +############################################################# +## +## Z3 (optional) +## +############################################################# +if (ENABLE_Z3) + message (STATUS "StoRM - Linking with Z3") + include_directories("${Z3_ROOT}/include") + target_link_libraries(cudaForStorm ${Z3_LIB_NAME}) +endif(ENABLE_Z3) + +############################################################# +## +## Threads +## +############################################################# +include_directories(${THREADS_INCLUDE_DIRS}) +target_link_libraries(cudaForStorm ${CMAKE_THREAD_LIBS_INIT}) + +if (MSVC) + # Add the DebugHelper DLL + set(CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES} Dbghelp.lib") + target_link_libraries(cudaForStorm "Dbghelp.lib") +endif(MSVC) + +# Link against libc++abi if requested. 
May be needed to build on Linux systems using clang.
+if (LINK_LIBCXXABI)
+	message (STATUS "StoRM - Linking against libc++abi.")
+	target_link_libraries(storm "c++abi")
+	target_link_libraries(storm-functional-tests "c++abi")
+	target_link_libraries(storm-performance-tests "c++abi")
+endif(LINK_LIBCXXABI)
\ No newline at end of file
diff --git a/resources/cudaForStorm/src/cudaTests.h b/resources/cudaForStorm/src/cudaTests.h
new file mode 100644
index 000000000..2055953ed
--- /dev/null
+++ b/resources/cudaForStorm/src/cudaTests.h
@@ -0,0 +1,124 @@
+#include 
+#include "srcCuda/allCudaKernels.h"
+
+#include <iostream>
+#include <chrono>
+#include <random>
+
+void cudaShowDevices() {
+	// Todo
+}
+
+void cudaSimpleAddTest(int a, int b) {
+	std::cout << "Running cudaSimpleAddTest:" << std::endl;
+	std::cout << "a = " << a << ", b = " << b << "" << std::endl;
+
+	int c = cuda_basicAdd(a, b);
+
+	std::cout << "Result: " << c << "" << std::endl;
+}
+
+void cudaArrayFmaTest(int N) {
+	std::cout << "Running cudaArrayFmaTest:" << std::endl;
+	std::cout << "N is " << N << ", resulting in " << (5 * sizeof(int) * N) << " Bytes of Data." << std::endl;
+
+	std::cout << "Generating random input arrays." << std::endl;
+
+	std::default_random_engine generator;
+	std::uniform_int_distribution<int> distribution(0, INT32_MAX);
+	int dice_roll = distribution(generator);
+
+	auto start_time = std::chrono::high_resolution_clock::now();
+
+	int* arrayA = new int[N];
+	int* arrayB = new int[N];
+	int* arrayC = new int[N];
+	int* arrayD = new int[N];
+	int* arrayD_CPU = new int[N];
+
+	for (int i = 0; i < N; ++i) {
+		//arrayA[i] = distribution(generator);
+		//arrayB[i] = distribution(generator);
+		//arrayC[i] = distribution(generator);
+		arrayA[i] = i * 1000 + 137;
+		arrayB[i] = i * 7000 + 1537;
+		arrayC[i] = i * 15000 + 97;
+		arrayD[i] = 0;
+		arrayD_CPU[i] = 0;
+	}
+
+	auto end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "Array generation took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " microseconds" << std::endl;
+
+	std::cout << "Running FMA test on CPU." << std::endl;
+
+	start_time = std::chrono::high_resolution_clock::now();
+	cuda_arrayFmaHelper(arrayA, arrayB, arrayC, arrayD_CPU, N);
+	end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "FMA on CPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " microseconds" << std::endl;
+
+	start_time = std::chrono::high_resolution_clock::now();
+	cuda_arrayFma(arrayA, arrayB, arrayC, arrayD, N);
+	end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "FMA on GPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " microseconds" << std::endl;
+
+	int errors = 0;
+	for (int i = 0; i < N; ++i) {
+		if (arrayD[i] != arrayD_CPU[i]) {
+			std::cout << "Error in Entry " << i << ": GPU has " << arrayD[i] << " but CPU has " << arrayD_CPU[i] << "!" << std::endl;
+			++errors;
+		}
+	}
+	std::cout << "Checked Arrays for Errors: " << errors << " Errors occurred." << std::endl;
+}
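+
+// The next test exercises the interleaved variant of the FMA kernel: all four operand
+// arrays are packed into one buffer (A,B,C,D,A,B,C,D,...), and M is the number of
+// quadruples each GPU thread processes (see cuda_arrayFmaOptimized in basicAdd.cu).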
+void cudaArrayFmaOptimizedTest(int N, int M) {
+	std::cout << "Running cudaArrayFmaOptimizedTest:" << std::endl;
+	std::cout << "N is " << N << ", resulting in " << (4 * sizeof(int) * N) << " Bytes of Data." << std::endl;
+
+	size_t freeCudaMemory = getFreeCudaMemory();
+	size_t totalCudaMemory = getTotalCudaMemory();
+	int freeProzent = static_cast<int>(((double)freeCudaMemory)/((double)totalCudaMemory) * 100);
+
+	std::cout << "CUDA Device has " << freeCudaMemory << " Bytes of " << totalCudaMemory << " Bytes free (" << (freeProzent) << "%)." << std::endl;
+
+	std::cout << "Generating random input arrays." << std::endl;
+
+	std::default_random_engine generator;
+	std::uniform_int_distribution<int> distribution(0, INT32_MAX);
+
+	auto start_time = std::chrono::high_resolution_clock::now();
+
+	int* arrayA = new int[4 * N];
+	int* arrayA_CPU = new int[4 * N];
+
+	for (int i = 0; i < 4*N; ++i) {
+		arrayA[i] = i * 1000 + i + (357854878 % (i+1));
+		arrayA_CPU[i] = arrayA[i];
+	}
+
+	auto end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "Array generation took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " microseconds" << std::endl;
+
+	start_time = std::chrono::high_resolution_clock::now();
+	cuda_arrayFmaOptimizedHelper(arrayA_CPU, N);
+	end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "FMA on CPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " microseconds" << std::endl;
+
+	start_time = std::chrono::high_resolution_clock::now();
+	cuda_arrayFmaOptimized(arrayA, N, M);
+	end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "FMA on GPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " microseconds" << std::endl;
+
+	int errors = 0;
+	for (int i = 0; i < N; i+=4) {
+		if (arrayA[i+3] != arrayA_CPU[i+3]) {
+			//std::cout << "Error in Entry " << i << ": GPU has " << arrayA[i+3] << " but CPU has " << arrayA_CPU[i+3] << "!" << std::endl;
+			++errors;
+		}
+	}
+	std::cout << "Checked Arrays for Errors: " << errors << " Errors occurred." << std::endl;
+
+	delete[] arrayA;
+	delete[] arrayA_CPU;
+}
\ No newline at end of file
diff --git a/resources/cudaForStorm/src/main.cpp b/resources/cudaForStorm/src/main.cpp
new file mode 100644
index 000000000..52f1d1678
--- /dev/null
+++ b/resources/cudaForStorm/src/main.cpp
@@ -0,0 +1,62 @@
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include "cudaTests.h"
+
+int main(int argc, char **argv){
+	resetCudaDevice();
+
+	int testNumber = 0;
+	int N = 10000;
+	int M = 402653184;
+	if (argc > 1) {
+		testNumber = atoi(argv[1]);
+		if (argc > 2) {
+			N = atoi(argv[2]);
+			if (argc > 3) {
+				M = atoi(argv[3]);
+			}
+		}
+	}
+
+	switch (testNumber) {
+		case 1:
+			cudaSimpleAddTest(N, M);
+			break;
+		case 2:
+			cudaArrayFmaTest(N);
+			break;
+		case 3:
+			cudaArrayFmaOptimizedTest(N, M);
+			break;
+		case 4:
+			cpp_cuda_bandwidthTest(M, N);
+			break;
+		case 5:
+			kernelSwitchTest(N);
+			break;
+		// DEFAULT AND 0
+		case 0:
+		default:
+			std::cout << "Available functions are:" << std::endl;
+			std::cout << "0 - Show this overview" << std::endl;
+			std::cout << "1 - cuda simpleAddTest(N, M)" << std::endl;
+			std::cout << "2 - cuda arrayFmaTest(N)" << std::endl;
+			std::cout << "3 - cuda arrayFmaOptimizedTest(N, M)" << std::endl;
+			std::cout << "4 - cuda bandwidthTest(M, N)" << std::endl;
+			std::cout << "5 - cuda kernelSwitchTest(N)" << std::endl;
+			std::cout << std::endl;
+			std::cout << "Call: " << argv[0] << " Selection [N [M]]" << std::endl;
+			std::cout << "Defaults:" << std::endl;
+			break;
+	}
+
+	return 0;
+}
diff --git a/resources/cudaForStorm/srcCuda/basicAdd.cu b/resources/cudaForStorm/srcCuda/basicAdd.cu
new file mode 100644
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/basicAdd.cu
@@ -0,0 +1,286 @@
+#include 
+#include 
+
+#include 
+#include 
+
+__global__ void cuda_kernel_basicAdd(int a, int b, int *c) {
+	*c = a + b;
+}
+
+__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
+	// Fused Multiply Add:
+	// A * B + C => D
+
+	/*
+	 * The variable i is used to index into the arrays. Since every thread executes this
+	 * function, i has to be different for each thread; otherwise several threads would
+	 * write to the same array index. blockDim.x is the number of threads in the
+	 * x-dimension of a block, blockIdx.x is the x-coordinate of the current block, and
+	 * threadIdx.x is the x-coordinate of the thread that is currently executing this
+	 * function.
+	 */
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+	if (i < N) {
+		D[i] = A[i] * B[i] + C[i];
+	}
+}
+
+__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) {
+	// Fused Multiply Add:
+	// A * B + C => D
+
+	// Layout:
+	// A B C D A B C D A B C D
+
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+	if ((i*M) < N) {
+		for (int j = i*M; j < i*M + M; ++j) {
+			A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2];
+		}
+	}
+}
+
+extern "C" int cuda_basicAdd(int a, int b) {
+	int c = 0;
+	int *dev_c;
+	cudaMalloc((void**)&dev_c, sizeof(int));
+	cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c);
+	cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
+	//printf("%d + %d + 42 is %d\n", a, b, c);
+	cudaFree(dev_c);
+	return c;
+}
+
+void cpp_cuda_bandwidthTest(int entryCount, int N) {
+	// Size of the Arrays
+	size_t arraySize = entryCount * sizeof(int);
+
+	int* deviceIntArray;
+	int* hostIntArray = new int[entryCount];	// entryCount ints (arraySize is already in bytes)
+
+	// Allocate space on the device
+	auto start_time = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < N; ++i) {
+		if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
+			std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl;
+			delete[] hostIntArray;
+			return;
+		}
+		// Free memory on device
+		if (cudaFree(deviceIntArray) != cudaSuccess) {
+			std::cout << "Error in cudaFree!" << std::endl;
+			delete[] hostIntArray;
+			return;
+		}
+	}
+	auto end_time = std::chrono::high_resolution_clock::now();
+	auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+	// copyTime is in microseconds, so bytes per microsecond equal 10^6 bytes per second;
+	// the factor 0.95367431640625 (= 10^6 / 2^20) converts that figure to MiB per second.
+	double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
+	std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
+	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second allocation speed." << std::endl;
+
+	if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
+		std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl;
+		delete[] hostIntArray;
+		return;
+	}
+
+	// Prepare data
+	for (int i = 0; i < N; ++i) {
+		hostIntArray[i] = i * 333 + 123;
+	}
+
+	// Copy data TO device
+	start_time = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < N; ++i) {
+		if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+			std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl;
+			// Free memory on device
+			if (cudaFree(deviceIntArray) != cudaSuccess) {
+				std::cout << "Error in cudaFree!" << std::endl;
+			}
+			delete[] hostIntArray;
+			return;
+		}
+	}
+	end_time = std::chrono::high_resolution_clock::now();
+	copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+	mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
+	std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
+	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl;
+
+	// Copy data FROM device
+	start_time = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < N; ++i) {
+		if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
+			std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl;
+			// Free memory on device
+			if (cudaFree(deviceIntArray) != cudaSuccess) {
+				std::cout << "Error in cudaFree!" << std::endl;
+			}
+			delete[] hostIntArray;
+			return;
+		}
+	}
+	end_time = std::chrono::high_resolution_clock::now();
+	copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+	mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
+	std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
+	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl;
+
+	// Free memory on device
+	if (cudaFree(deviceIntArray) != cudaSuccess) {
+		std::cout << "Error in cudaFree!" << std::endl;
+	}
+	delete[] hostIntArray;
+}
+
+extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
+	// Size of the Arrays
+	size_t arraySize = N * sizeof(int);
+
+	int* deviceIntArrayA;
+	int* deviceIntArrayB;
+	int* deviceIntArrayC;
+	int* deviceIntArrayD;
+
+	// Allocate space on the device
+	if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
+		printf("Error in cudaMalloc1!\n");
+		return;
+	}
+	if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) {
+		printf("Error in cudaMalloc2!\n");
+		cudaFree(deviceIntArrayA);
+		return;
+	}
+	if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) {
+		printf("Error in cudaMalloc3!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		return;
+	}
+	if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) {
+		printf("Error in cudaMalloc4!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		cudaFree(deviceIntArrayC);
+		return;
+	}
+
+	// Copy data TO device
+	if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		cudaFree(deviceIntArrayC);
+		cudaFree(deviceIntArrayD);
+		return;
+	}
+	if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		cudaFree(deviceIntArrayC);
+		cudaFree(deviceIntArrayD);
+		return;
+	}
+	if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		cudaFree(deviceIntArrayC);
+		cudaFree(deviceIntArrayD);
+		return;
+	}
+
+	// Number of threads per block
+	int threadsPerBlock = 512;
+	// Use enough blocks so that all elements of the vectors can be processed
+	int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+	// Run kernel
+	cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N);
+
+	// Copy data FROM device
+	if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		cudaFree(deviceIntArrayC);
+		cudaFree(deviceIntArrayD);
+		return;
+	}
+
+	// Free memory on device
+	cudaFree(deviceIntArrayA);
+	cudaFree(deviceIntArrayB);
+	cudaFree(deviceIntArrayC);
+	cudaFree(deviceIntArrayD);
+}
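+
+// The "optimized" variant below keeps all four operands in one interleaved device buffer
+// (A,B,C,D,A,B,C,D, ...), so a single cudaMalloc and a single transfer in each direction
+// suffice, and each thread works on M consecutive quadruples.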
+
+extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) {
+	// Size of the Arrays
+	size_t arraySize = N * sizeof(int) * 4;
+
+	int* deviceIntArrayA;
+
+	// Allocate space on the device
+	if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
+		printf("Error in cudaMalloc1!\n");
+		return;
+	}
+
+// Helper macros: ONFAILFREEn frees the given device buffers; CHECKED_CUDA_CALL pastes
+// the given name onto the "cuda" prefix (cuda##func__), runs the call and, on failure,
+// frees the buffers and returns. #func__ stringifies the name for the error message.
+#define ONFAILFREE0() do { } while(0)
+#define ONFAILFREE1(a) do { cudaFree(a); } while(0)
+#define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0)
+#define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0)
+#define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0)
+#define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { cudaError_t retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in cuda" #func__ "!\n"); return; } } while(0)
+
+	// Copy data TO device
+
+	CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice);
+
+	/*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		return;
+	}*/
+
+	// Number of threads per block
+	int threadsPerBlock = 512;
+	// Use enough blocks so that all elements of the vectors can be processed
+	int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+	// Run kernel
+	cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M);
+
+	// Copy data FROM device
+	if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		return;
+	}
+
+	// Free memory on device
+	if (cudaFree(deviceIntArrayA) != cudaSuccess) {
+		printf("Error in cudaFree!\n");
+		return;
+	}
+}
+
+extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
+	for (int i = 0; i < N; ++i) {
+		D[i] = A[i] * B[i] + C[i];
+	}
+}
+
+extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) {
+	for (int i = 0; i < N; i += 4) {
+		A[i+3] = A[i] * A[i+1] + A[i+2];
+	}
+}
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/basicAdd.h b/resources/cudaForStorm/srcCuda/basicAdd.h
new file mode 100644
index 000000000..b167244e8
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/basicAdd.h
@@ -0,0 +1,9 @@
+extern "C" int cuda_basicAdd(int a, int b);
+
+extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M);
+extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N);
+
+extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N);
+extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N);
+
+void cpp_cuda_bandwidthTest(int entryCount, int N);
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu b/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
new file mode 100644
index 000000000..2be10e8ca
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
@@ -0,0 +1,39 @@
+#include <iostream>
+#include <chrono>
+
+__global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) {
+	*B = *A;
+}
+
+void kernelSwitchTest(size_t N) {
+	int* deviceIntA;
+	int* deviceIntB;
+
+	if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) {
+		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
+		return;
+	}
+	if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) {
+		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
+		return;
+	}
+
+	// Launch the minimal kernel N times and measure the total launch time
+	auto start_time = std::chrono::high_resolution_clock::now();
+	for (size_t i = 0; i < N; ++i) {
+		cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB);
+	}
+	auto end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " microseconds" << std::endl;
+	std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << " Microseconds per Kernel Switch" << std::endl;
+
+	// Free memory on device
+	if (cudaFree(deviceIntA) != cudaSuccess) {
+		std::cout << "Error in cudaFree!" << std::endl;
+		return;
+	}
+	if (cudaFree(deviceIntB) != cudaSuccess) {
+		std::cout << "Error in cudaFree!" << std::endl;
+		return;
+	}
+}
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/kernelSwitchTest.h b/resources/cudaForStorm/srcCuda/kernelSwitchTest.h
new file mode 100644
index 000000000..dff8a13ff
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/kernelSwitchTest.h
@@ -0,0 +1 @@
+void kernelSwitchTest(size_t N);
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/utility.cu b/resources/cudaForStorm/srcCuda/utility.cu
new file mode 100644
index 000000000..9366453f9
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/utility.cu
@@ -0,0 +1,19 @@
+size_t getFreeCudaMemory() {
+	size_t freeMemory;
+	size_t totalMemory;
+	cudaMemGetInfo(&freeMemory, &totalMemory);
+
+	return freeMemory;
+}
+
+size_t getTotalCudaMemory() {
+	size_t freeMemory;
+	size_t totalMemory;
+	cudaMemGetInfo(&freeMemory, &totalMemory);
+
+	return totalMemory;
+}
+
+void resetCudaDevice() {
+	cudaDeviceReset();
+}
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/utility.h b/resources/cudaForStorm/srcCuda/utility.h
new file mode 100644
index 000000000..ed25af9b6
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/utility.h
@@ -0,0 +1,3 @@
+size_t getFreeCudaMemory();
+size_t getTotalCudaMemory();
+void resetCudaDevice();
\ No newline at end of file

From 3e44d889582acbffa2e7eaf98a2843288e272052 Mon Sep 17 00:00:00 2001
From: PBerger
Date: Wed, 26 Feb 2014 04:44:48 +0100
Subject: [PATCH 14/43] Fixed a bug in the FindCUDA.cmake file on the client
 side

Former-commit-id: 6cb2d77777e899799753e342444be1fef1dadef1
---
 resources/cudaForStorm/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt
index a3d1e7f4f..1b01ca337 100644
--- a/resources/cudaForStorm/CMakeLists.txt
+++ b/resources/cudaForStorm/CMakeLists.txt
@@ -76,6 +76,11 @@ set(BUILD_SHARED_LIBS OFF)
 set(CUDA_SEPARABLE_COMPILATION ON)
 #set(CUDA_NVCC_FLAGS "-arch=sm_30")
 
+# Because the FindCUDA.cmake file has a path-related bug, two folders have to be present
+file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaLibrary.dir/Debug")
+file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaLibrary.dir/Release")
+
+
 #############################################################
 ##
 ##	Compiler specific settings and definitions

From 8781aa27b621ea31d82b37b44a9d281572afaf4b Mon Sep 17 00:00:00
2001 From: PBerger Date: Thu, 27 Feb 2014 14:45:12 +0100 Subject: [PATCH 15/43] Added cudaForStorm as a dynamic library extension Former-commit-id: 31c6be4c1c5a05bfbed018ae6f3a4c6bbb918bfa --- CMakeLists.txt | 24 +++++++++++++++++++++++ resources/cudaForStorm/CMakeLists.txt | 15 ++++++++------ resources/cudaForStorm/src/cudaForStorm.h | 15 ++++++++++++++ resources/cudaForStorm/src/main.cpp | 9 ++++++++- storm-config.h.in | 3 +++ 5 files changed, 59 insertions(+), 7 deletions(-) create mode 100644 resources/cudaForStorm/src/cudaForStorm.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fbc351e4..1e6227e6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ option(STORM_USE_COTIRE "Sets whether Cotire should be used (for building precom option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF) option(USE_LIBCXX "Sets whether the standard library is libc++." OFF) option(ENABLE_GLPK "Sets whether StoRM is built with support for glpk." OFF) +option(ENABLE_CUDAFORSTORM "Sets whether StoRM is built with its CUDA extension." OFF) set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).") set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).") set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.") @@ -183,6 +184,13 @@ else() set(STORM_CPP_GLPK_DEF "undef") endif() +# CUDA Defines +if (ENABLE_CUDAFORSTORM) + set(STORM_CPP_CUDAFORSTORM_DEF "define") +else() + set(STORM_CPP_CUDAFORSTORM_DEF "undef") +endif() + # Z3 Defines if (ENABLE_Z3) set(STORM_CPP_Z3_DEF "define") @@ -286,6 +294,9 @@ endif() if (ENABLE_Z3) link_directories("${Z3_ROOT}/bin") endif() +if (ENABLE_CUDAFORSTORM) + link_directories("${PROJECT_SOURCE_DIR}/build/cudaForStorm/lib") +endif() if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL "")) set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib") endif () @@ -317,6 +328,19 @@ target_link_libraries(storm-performance-tests ${Boost_LIBRARIES}) #message(STATUS "BOOST_INCLUDE_DIRS is ${Boost_INCLUDE_DIRS}") #message(STATUS "BOOST_LIBRARY_DIRS is ${Boost_LIBRARY_DIRS}") +############################################################# +## +## CUDA For Storm +## +############################################################# +if (ENABLE_CUDAFORSTORM) + message (STATUS "StoRM - Linking with CudaForStorm") + include_directories("${PROJECT_SOURCE_DIR}/build/cudaForStorm/include") + target_link_libraries(storm cudaForStorm) + target_link_libraries(storm-functional-tests cudaForStorm) + target_link_libraries(storm-performance-tests cudaForStorm) +endif(ENABLE_CUDAFORSTORM) + ############################################################# ## ## CUDD diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt index 1b01ca337..ab2c2c8c7 100644 --- a/resources/cudaForStorm/CMakeLists.txt +++ b/resources/cudaForStorm/CMakeLists.txt @@ -27,6 +27,7 @@ set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).") set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).") set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.") set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.") +set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE STRING "The Build directory of storm, where the library files should be installed to (if available).") ############################################################# ## @@ 
-258,11 +259,11 @@ endif() # Since this will be a library include (GenerateExportHeader) -add_library(cudaForStorm STATIC ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) +add_library(cudaForStorm SHARED ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) GENERATE_EXPORT_HEADER( cudaForStorm BASE_NAME cudaForStorm EXPORT_MACRO_NAME cudaForStorm_EXPORT - EXPORT_FILE_NAME cudaForStorm_Export.h + EXPORT_FILE_NAME include/cudaForStorm_Export.h STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC ) @@ -333,7 +334,9 @@ endif(MSVC) # Link against libc++abi if requested. May be needed to build on Linux systems using clang. if (LINK_LIBCXXABI) message (STATUS "StoRM - Linking against libc++abi.") - target_link_libraries(storm "c++abi") - target_link_libraries(storm-functional-tests "c++abi") - target_link_libraries(storm-performance-tests "c++abi") -endif(LINK_LIBCXXABI) \ No newline at end of file + target_link_libraries(cudaForStorm "c++abi") +endif(LINK_LIBCXXABI) + +# Install Directive +install(TARGETS cudaForStorm DESTINATION "${STORM_LIB_INSTALL_DIR}/lib") +install(FILES "${PROJECT_SOURCE_DIR}/src/cudaForStorm.h" "${PROJECT_BINARY_DIR}/cudaForStorm_Export.h" DESTINATION "${STORM_LIB_INSTALL_DIR}/include") \ No newline at end of file diff --git a/resources/cudaForStorm/src/cudaForStorm.h b/resources/cudaForStorm/src/cudaForStorm.h new file mode 100644 index 000000000..ebcb4eaa2 --- /dev/null +++ b/resources/cudaForStorm/src/cudaForStorm.h @@ -0,0 +1,15 @@ +#ifndef STORM_CUDAFORSTORM_CUDAFORSTORM_H_ +#define STORM_CUDAFORSTORM_CUDAFORSTORM_H_ + +// Library exports +#include "cudaForStorm_Export.h" + +/* + * List of exported functions in this library + */ + +cudaForStorm_EXPORT int cudaForStormTest(int value); + + + +#endif // STORM_CUDAFORSTORM_CUDAFORSTORM_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/src/main.cpp b/resources/cudaForStorm/src/main.cpp index 52f1d1678..b555cd103 100644 --- a/resources/cudaForStorm/src/main.cpp +++ b/resources/cudaForStorm/src/main.cpp @@ -1,3 +1,5 @@ +#include "cudaForStorm.h" + #include #include @@ -7,7 +9,12 @@ #include "cudaTests.h" -int main(int argc, char **argv){ +int cudaForStormTest(int value) { + return value + 42; +} + + +int main_Test12345(int argc, char **argv){ resetCudaDevice(); int testNumber = 0; diff --git a/storm-config.h.in b/storm-config.h.in index d44ae122c..101928b59 100644 --- a/storm-config.h.in +++ b/storm-config.h.in @@ -22,6 +22,9 @@ // Whether GLPK is available and to be used (define/undef) #@STORM_CPP_GLPK_DEF@ STORM_HAVE_GLPK +// Whether CudaForStorm is available and to be used (define/undef) +#@STORM_CPP_CUDAFORSTORM_DEF@ STORM_HAVE_CUDAFORSTORM + // Whether Z3 is available and to be used (define/undef) #@STORM_CPP_Z3_DEF@ STORM_HAVE_Z3 From 0307007d279c074202b42e86a5bac8c5d45e04bf Mon Sep 17 00:00:00 2001 From: PBerger Date: Thu, 27 Feb 2014 14:46:12 +0100 Subject: [PATCH 16/43] Fixed a non-ISOC++ Compliant call to getcwd Former-commit-id: 2d1a3c8cbb5416d9be64b807d0ab4434cdc21c8b --- src/storm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/storm.cpp b/src/storm.cpp index fadcb3db1..69cb0c454 100644 --- a/src/storm.cpp +++ b/src/storm.cpp @@ -141,7 +141,7 @@ void setUpFileLogging() { */ std::string getCurrentWorkingDirectory() { char temp[512]; - return (getcwd(temp, 512 - 1) ? std::string(temp) : std::string("")); + return (_getcwd(temp, 512 - 1) ? std::string(temp) : std::string("")); } /*! 
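Note on the getcwd change above: _getcwd is the MSVC-specific spelling (declared in
<direct.h>), while POSIX systems only provide plain getcwd, so calling _getcwd
unconditionally trades the MSVC deprecation warning for a build break under GCC/Clang
on Linux or OS X. A compiler-guarded dispatch stays portable; the sketch below is an
illustration only (the STORM_GETCWD macro is hypothetical and not part of these patches):

#ifdef _MSC_VER
#include <direct.h>
#define STORM_GETCWD _getcwd
#else
#include <unistd.h>
#define STORM_GETCWD getcwd
#endif

std::string getCurrentWorkingDirectory() {
	char temp[512];
	return (STORM_GETCWD(temp, 512 - 1) ? std::string(temp) : std::string(""));
}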
From e78fd3fdcfcdcfbc76324cad8a5eccb83082d706 Mon Sep 17 00:00:00 2001 From: PBerger Date: Sun, 2 Mar 2014 01:26:48 +0100 Subject: [PATCH 17/43] Added a function header for a Value Iteration Kernel. Removed the intermediate project from CMake Former-commit-id: 8b49570eb0facad002c116b505784f680795ef29 --- resources/cudaForStorm/CMakeLists.txt | 190 ++++-------------- .../cudaForStorm/srcCuda/allCudaKernels.h | 3 +- .../srcCuda/basicValueIteration.cu | 50 +++++ .../srcCuda/basicValueIteration.h | 4 + 4 files changed, 93 insertions(+), 154 deletions(-) create mode 100644 resources/cudaForStorm/srcCuda/basicValueIteration.cu create mode 100644 resources/cudaForStorm/srcCuda/basicValueIteration.h diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt index ab2c2c8c7..1ddfe128f 100644 --- a/resources/cudaForStorm/CMakeLists.txt +++ b/resources/cudaForStorm/CMakeLists.txt @@ -11,20 +11,17 @@ set (STORM_CPP_VERSION_MINOR 0) include_directories("${PROJECT_SOURCE_DIR}") include_directories("${PROJECT_SOURCE_DIR}/src") -message(STATUS "CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}") +message(STATUS "StoRM (CudaPlugin) - CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}") ############################################################# ## ## CMake options of StoRM ## ############################################################# -option(DEBUG "Sets whether the DEBUG mode is used" ON) -option(USE_POPCNT "Sets whether the popcnt instruction is going to be used." ON) +option(CUDAFORSTORM_DEBUG "Sets whether the DEBUG mode is used" ON) option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF) option(USE_LIBCXX "Sets whether the standard library is libc++." OFF) -option(ENABLE_GLPK "Sets whether StoRM is built with support for glpk." 
OFF) -set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).") -set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).") + set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.") set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.") set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE STRING "The Build directory of storm, where the library files should be installed to (if available).") @@ -43,28 +40,15 @@ find_package(Doxygen REQUIRED) find_package(Threads REQUIRED) # If the DEBUG option was turned on, we will target a debug version and a release version otherwise -if (DEBUG) +if (CUDAFORSTORM_DEBUG) set (CMAKE_BUILD_TYPE "DEBUG") else() set (CMAKE_BUILD_TYPE "RELEASE") endif() -message(STATUS "StoRM - Building ${CMAKE_BUILD_TYPE} version.") - -if ("${GUROBI_ROOT}" STREQUAL "") - set(ENABLE_GUROBI OFF) -else() - set(ENABLE_GUROBI ON) -endif() - -if ("${Z3_ROOT}" STREQUAL "") - set(ENABLE_Z3 OFF) -else() - set(ENABLE_Z3 ON) - set(Z3_LIB_NAME "z3") -endif() +message(STATUS "StoRM (CudaPlugin) - Building ${CMAKE_BUILD_TYPE} version.") -message(STATUS "StoRM - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") -message(STATUS "StoRM - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}") +message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") +message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}") ############################################################# ## @@ -78,8 +62,8 @@ set(CUDA_SEPARABLE_COMPILATION ON) #set(CUDA_NVCC_FLAGS "-arch=sm_30") # Because the FindCUDA.cmake file has a path related bug, two folders have to be present -file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaLibrary.dir/Debug") -file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaLibrary.dir/Release") +file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Debug") +file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Release") ############################################################# @@ -87,28 +71,13 @@ file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaLibrary.dir/Release") ## Compiler specific settings and definitions ## ############################################################# - -# Path to the no-strict-aliasing target -set(CONVERSIONHELPER_TARGET "${PROJECT_SOURCE_DIR}/src/utility/ConversionHelper.cpp") - if(CMAKE_COMPILER_IS_GNUCC) - message(STATUS "StoRM - Using Compiler Configuration: GCC") + message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: GCC") # Set standard flags for GCC set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -pedantic") - # -Werror is atm removed as this gave some problems with existing code - # May be re-set later - # (Thomas Heinemann, 2012-12-21) - - # Turn on popcnt instruction if desired (yes by default) - if (USE_POPCNT) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt") - endif(USE_POPCNT) - - # Set the no-strict-aliasing target for GCC - set_source_files_properties(${CONVERSIONHELPER_TARGET} PROPERTIES COMPILE_FLAGS " -fno-strict-aliasing ") elseif(MSVC) - message(STATUS "StoRM - Using Compiler Configuration: MSVC") + message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: MSVC") # required for GMM to compile, ugly error directive in their code add_definitions(/D_SCL_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS) # 
required as the PRCTL Parser bloats object files (COFF) beyond their maximum size (see http://msdn.microsoft.com/en-us/library/8578y171(v=vs.110).aspx) @@ -117,26 +86,18 @@ elseif(MSVC) add_definitions(/D_VARIADIC_MAX=10) # Windows.h breaks GMM in gmm_except.h because of its macro definition for min and max add_definitions(/DNOMINMAX) - - if(ENABLE_Z3) - set(Z3_LIB_NAME "libz3") - endif() - - # MSVC does not do strict-aliasing, so no option needed else(CLANG) - message(STATUS "StoRM - Using Compiler Configuration: Clang (LLVM)") + message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: Clang (LLVM)") # As CLANG is not set as a variable, we need to set it in case we have not matched another compiler. set (CLANG ON) # Set standard flags for clang set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops -O3") if(UNIX AND NOT APPLE AND NOT USE_LIBCXX) set(CLANG_STDLIB libstdc++) - message(STATUS "StoRM - Linking against libstdc++") + message(STATUS "StoRM (CudaPlugin) - Linking against libstdc++") else() set(CLANG_STDLIB libc++) - message(STATUS "StoRM - Linking against libc++") - # Disable Cotire - set(STORM_USE_COTIRE OFF) + message(STATUS "StoRM (CudaPlugin) - Linking against libc++") # Set up some Xcode specific settings set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11") set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") @@ -145,14 +106,6 @@ else(CLANG) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=${CLANG_STDLIB} -Wall -pedantic -Wno-unused-variable -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -ftemplate-depth=1024") set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g") - - # Turn on popcnt instruction if desired (yes by default) - if (USE_POPCNT) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt") - endif(USE_POPCNT) - - # Set the no-strict-aliasing target for Clang - set_source_files_properties(${CONVERSIONHELPER_TARGET} PROPERTIES COMPILE_FLAGS " -fno-strict-aliasing ") endif() ############################################################# @@ -160,35 +113,6 @@ endif() ## CMake-generated Config File for StoRM ## ############################################################# -# Base path for test files -set(STORM_CPP_TESTS_BASE_PATH "${PROJECT_SOURCE_DIR}/test") -# Gurobi Defines -if (ENABLE_GUROBI) - set(STORM_CPP_GUROBI_DEF "define") -else() - set(STORM_CPP_GUROBI_DEF "undef") -endif() - -# glpk defines -if (ENABLE_GLPK) - set(STORM_CPP_GLPK_DEF "define") -else() - set(STORM_CPP_GLPK_DEF "undef") -endif() - -# Z3 Defines -if (ENABLE_Z3) - set(STORM_CPP_Z3_DEF "define") -else() - set(STORM_CPP_Z3_DEF "undef") -endif() - -# Intel TBB Defines -if (TBB_FOUND AND ENABLE_INTELTBB) - set(STORM_CPP_INTELTBB_DEF "define") -else() - set(STORM_CPP_INTELTBB_DEF "undef") -endif() # Configure a header file to pass some of the CMake settings to the source code configure_file ( @@ -213,7 +137,7 @@ file(GLOB_RECURSE CUDAFORSTORM_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/srcCuda/*.cu" file(GLOB_RECURSE CUDAFORSTORM_CUDA_HEADERS "${PROJECT_SOURCE_DIR}/srcCuda/*.h") # Additional include files like the storm-config.h -file(GLOB_RECURSE STORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h) +file(GLOB_RECURSE CUDAFORSTORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h) # Group the headers and sources source_group(main FILES ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) @@ -221,11 +145,11 @@ source_group(cuda FILES ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS # Add custom additional include or link directories if (ADDITIONAL_INCLUDE_DIRS) 
- message(STATUS "StoRM - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}") + message(STATUS "StoRM (CudaPlugin) - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}") include_directories(${ADDITIONAL_INCLUDE_DIRS}) endif(ADDITIONAL_INCLUDE_DIRS) if (ADDITIONAL_LINK_DIRS) - message(STATUS "StoRM - Using additional link directories ${ADDITIONAL_LINK_DIRS}") + message(STATUS "StoRM (CudaPlugin) - Using additional link directories ${ADDITIONAL_LINK_DIRS}") link_directories(${ADDITIONAL_LINK_DIRS}) endif(ADDITIONAL_LINK_DIRS) @@ -234,19 +158,7 @@ endif(ADDITIONAL_LINK_DIRS) ## Pre executable-creation link_directories setup ## ############################################################# -if (ENABLE_GUROBI) - link_directories("${GUROBI_ROOT}/lib") -endif() -if (ENABLE_Z3) - link_directories("${Z3_ROOT}/bin") -endif() -if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL "")) - set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib") -endif () -link_directories(${Boost_LIBRARY_DIRS}) -if (TBB_FOUND AND ENABLE_INTELTBB) - link_directories(${TBB_LIBRARY_DIRS}) -endif() + ############################################################################### ## # @@ -255,17 +167,15 @@ endif() ## All link_directories() calls MUST be made before this point # ## # ############################################################################### - -# Since this will be a library include (GenerateExportHeader) -add_library(cudaForStorm SHARED ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) -GENERATE_EXPORT_HEADER( cudaForStorm - BASE_NAME cudaForStorm - EXPORT_MACRO_NAME cudaForStorm_EXPORT - EXPORT_FILE_NAME include/cudaForStorm_Export.h - STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC -) +#add_library(cudaForStorm SHARED ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) +#GENERATE_EXPORT_HEADER( cudaForStorm +# BASE_NAME cudaForStorm +# EXPORT_MACRO_NAME cudaForStorm_EXPORT +# EXPORT_FILE_NAME include/cudaForStorm_Export.h +# STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC +#) ############################################################# ## @@ -273,50 +183,24 @@ GENERATE_EXPORT_HEADER( cudaForStorm ## ############################################################# #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30) -cuda_add_library(cudaLibrary +cuda_add_library(cudaForStorm ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS} OPTIONS -DSTUFF="" -arch=sm_30 RELEASE -DNDEBUG DEBUG -g -DDEBUG ) -target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY}) -ADD_DEPENDENCIES(cudaForStorm cudaLibrary) -target_link_libraries(cudaForStorm cudaLibrary) -message(STATUS "Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}") +GENERATE_EXPORT_HEADER( cudaForStorm + BASE_NAME cudaForStorm + EXPORT_MACRO_NAME cudaForStorm_EXPORT + EXPORT_FILE_NAME include/cudaForStorm_Export.h + STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC +) +#target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY}) +#ADD_DEPENDENCIES(cudaForStorm cudaLibrary) +#target_link_libraries(cudaForStorm cudaLibrary) +message(STATUS "StoRM (CudaPlugin) - Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}") include_directories(${CUDA_INCLUDE_DIRS}) -############################################################# -## -## Gurobi (optional) -## -############################################################# -if (ENABLE_GUROBI) - message (STATUS "StoRM - Linking with Gurobi") - include_directories("${GUROBI_ROOT}/include") - 
target_link_libraries(cudaForStorm "gurobi56") -endif(ENABLE_GUROBI) - -############################################################# -## -## glpk (optional) -## -############################################################# -if (ENABLE_GLPK) - message (STATUS "StoRM - Linking with glpk") - target_link_libraries(cudaForStorm "glpk") -endif(ENABLE_GLPK) - -############################################################# -## -## Z3 (optional) -## -############################################################# -if (ENABLE_Z3) - message (STATUS "StoRM - Linking with Z3") - include_directories("${Z3_ROOT}/include") - target_link_libraries(cudaForStorm ${Z3_LIB_NAME}) -endif(ENABLE_Z3) - ############################################################# ## ## Threads @@ -333,7 +217,7 @@ endif(MSVC) # Link against libc++abi if requested. May be needed to build on Linux systems using clang. if (LINK_LIBCXXABI) - message (STATUS "StoRM - Linking against libc++abi.") + message (STATUS "StoRM (CudaPlugin) - Linking against libc++abi.") target_link_libraries(cudaForStorm "c++abi") endif(LINK_LIBCXXABI) diff --git a/resources/cudaForStorm/srcCuda/allCudaKernels.h b/resources/cudaForStorm/srcCuda/allCudaKernels.h index 1631b9104..182f1b770 100644 --- a/resources/cudaForStorm/srcCuda/allCudaKernels.h +++ b/resources/cudaForStorm/srcCuda/allCudaKernels.h @@ -1,4 +1,5 @@ #include "utility.h" #include "bandWidth.h" #include "basicAdd.h" -#include "kernelSwitchTest.h" \ No newline at end of file +#include "kernelSwitchTest.h" +#include "basicValueIteration.h" \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu new file mode 100644 index 000000000..d22d289ec --- /dev/null +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -0,0 +1,50 @@ +#include "basicValueIteration.h" + +#include +#include + +#include +#include "cusparse_v2.h" + + +__global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) { + *B = *A; +} + +void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector const& matrixColumnIndices, std::vector const& matrixValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { + std::cout << "basicValueIteration_mvReduce is implemented for ValueType == double :)" << std::endl; +} + +/* +void kernelSwitchTest(size_t N) { + int* deviceIntA; + int* deviceIntB; + + if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) { + std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; + return; + } + if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) { + std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" 
<< std::endl;
+		return;
+	}
+
+	// Allocate space on the device
+	auto start_time = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < N; ++i) {
+		cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB);
+	}
+	auto end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
+	std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl;
+
+	// Free memory on device
+	if (cudaFree(deviceIntA) != cudaSuccess) {
+		std::cout << "Error in cudaFree!" << std::endl;
+		return;
+	}
+	if (cudaFree(deviceIntB) != cudaSuccess) {
+		std::cout << "Error in cudaFree!" << std::endl;
+		return;
+	}
+}*/
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h
new file mode 100644
index 000000000..92bb44270
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h
@@ -0,0 +1,4 @@
+#include <cstdint>
+#include <vector>
+
+void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<uint_fast64_t> const& matrixColumnIndices, std::vector<double> const& matrixValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
\ No newline at end of file

From da9fe04ba4c6e6099d6eab6faceba5e648677824 Mon Sep 17 00:00:00 2001
From: PBerger 
Date: Sun, 2 Mar 2014 15:14:27 +0100
Subject: [PATCH 18/43] Removed the extra shell around the Cuda Plugin.
 Changed include paths.

Former-commit-id: c7fec9220dd2bfe1ff33ef10a98cd279b8ef018e
---
 CMakeLists.txt                                |   3 +-
 resources/cudaForStorm/CMakeLists.txt         |   4 +-
 resources/cudaForStorm/src/cudaForStorm.h     |  15 ---
 resources/cudaForStorm/src/cudaTests.h        | 124 ------------------
 resources/cudaForStorm/src/main.cpp           |  69 ----------
 .../srcCuda/basicValueIteration.cu            |   6 +-
 .../srcCuda/basicValueIteration.h             |   6 +-
 resources/cudaForStorm/srcCuda/cudaForStorm.h |  14 ++
 src/storm.cpp                                 |   3 +
 9 files changed, 31 insertions(+), 213 deletions(-)
 delete mode 100644 resources/cudaForStorm/src/cudaForStorm.h
 delete mode 100644 resources/cudaForStorm/src/cudaTests.h
 delete mode 100644 resources/cudaForStorm/src/main.cpp
 create mode 100644 resources/cudaForStorm/srcCuda/cudaForStorm.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b6058439..8d0e87995 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -330,7 +330,8 @@ target_link_libraries(storm-performance-tests ${Boost_LIBRARIES})
 #############################################################
 if (ENABLE_CUDAFORSTORM)
 	message (STATUS "StoRM - Linking with CudaForStorm")
-	include_directories("${PROJECT_SOURCE_DIR}/build/cudaForStorm/include")
+	include_directories("${PROJECT_BINARY_DIR}/cudaForStorm/include")
+	include_directories("${PROJECT_SOURCE_DIR}/resources/cudaForStorm")
 	target_link_libraries(storm cudaForStorm)
 	target_link_libraries(storm-functional-tests cudaForStorm)
 	target_link_libraries(storm-performance-tests cudaForStorm)
diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt
index 1ddfe128f..f3e679e92 100644
--- a/resources/cudaForStorm/CMakeLists.txt
+++ b/resources/cudaForStorm/CMakeLists.txt
@@ -183,7 +183,7 @@ include (GenerateExportHeader)
 ##
 #############################################################
 #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30)
-cuda_add_library(cudaForStorm +cuda_add_library(cudaForStorm SHARED ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS} OPTIONS -DSTUFF="" -arch=sm_30 RELEASE -DNDEBUG @@ -223,4 +223,4 @@ endif(LINK_LIBCXXABI) # Install Directive install(TARGETS cudaForStorm DESTINATION "${STORM_LIB_INSTALL_DIR}/lib") -install(FILES "${PROJECT_SOURCE_DIR}/src/cudaForStorm.h" "${PROJECT_BINARY_DIR}/cudaForStorm_Export.h" DESTINATION "${STORM_LIB_INSTALL_DIR}/include") \ No newline at end of file +install(FILES "${PROJECT_SOURCE_DIR}/srcCuda/cudaForStorm.h" "${PROJECT_BINARY_DIR}/include/cudaForStorm_Export.h" DESTINATION "${STORM_LIB_INSTALL_DIR}/include") \ No newline at end of file diff --git a/resources/cudaForStorm/src/cudaForStorm.h b/resources/cudaForStorm/src/cudaForStorm.h deleted file mode 100644 index ebcb4eaa2..000000000 --- a/resources/cudaForStorm/src/cudaForStorm.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef STORM_CUDAFORSTORM_CUDAFORSTORM_H_ -#define STORM_CUDAFORSTORM_CUDAFORSTORM_H_ - -// Library exports -#include "cudaForStorm_Export.h" - -/* - * List of exported functions in this library - */ - -cudaForStorm_EXPORT int cudaForStormTest(int value); - - - -#endif // STORM_CUDAFORSTORM_CUDAFORSTORM_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/src/cudaTests.h b/resources/cudaForStorm/src/cudaTests.h deleted file mode 100644 index 2055953ed..000000000 --- a/resources/cudaForStorm/src/cudaTests.h +++ /dev/null @@ -1,124 +0,0 @@ -#include -#include "srcCuda/allCudaKernels.h" - -#include -#include -#include - -void cudaShowDevices() { - // Todo -} - -void cudaSimpleAddTest(int a, int b) { - std::cout << "Running cudaSimpleAddTest:" << std::endl; - std::cout << "a = " << a << ", b = " << b << "" << std::endl; - - int c = cuda_basicAdd(a, b); - - std::cout << "Result: " << c << "" << std::endl; -} - -void cudaArrayFmaTest(int N) { - std::cout << "Running cudaArrayFmaTest:" << std::endl; - std::cout << "N is " << N << ", resulting in " << (5 * sizeof(int) * N) << " Bytes of Data." << std::endl; - - std::cout << "Generating random input arrays." << std::endl; - - std::default_random_engine generator; - std::uniform_int_distribution distribution(0, INT32_MAX); - int dice_roll = distribution(generator); - - auto start_time = std::chrono::high_resolution_clock::now(); - - int* arrayA = new int[N]; - int* arrayB = new int[N]; - int* arrayC = new int[N]; - int* arrayD = new int[N]; - int* arrayD_CPU = new int[N]; - - for (int i = 0; i < N; ++i) { - //arrayA[i] = distribution(generator); - //arrayB[i] = distribution(generator); - //arrayC[i] = distribution(generator); - arrayA[i] = i * 1000 + 137; - arrayB[i] = i * 7000 + 1537; - arrayC[i] = i * 15000 + 97; - arrayD[i] = 0; - arrayD_CPU[i] = 0; - } - - auto end_time = std::chrono::high_resolution_clock::now(); - std::cout << "Array generation took " << std::chrono::duration_cast(end_time - start_time).count() << "micros" << std::endl; - - std::cout << "Running FMA test on CPU." 
<< std::endl; - - start_time = std::chrono::high_resolution_clock::now(); - cuda_arrayFmaHelper(arrayA, arrayB, arrayC, arrayD_CPU, N); - end_time = std::chrono::high_resolution_clock::now(); - std::cout << "FMA on CPU took " << std::chrono::duration_cast(end_time - start_time).count() << "micros" << std::endl; - - start_time = std::chrono::high_resolution_clock::now(); - cuda_arrayFma(arrayA, arrayB, arrayC, arrayD, N); - end_time = std::chrono::high_resolution_clock::now(); - std::cout << "FMA on GPU took " << std::chrono::duration_cast(end_time - start_time).count() << "micros" << std::endl; - - int errors = 0; - for (int i = 0; i < N; ++i) { - if (arrayD[i] != arrayD_CPU[i]) { - std::cout << "Error in Entry " << i << ": GPU has " << arrayD[i] << " but CPU has " << arrayD_CPU[i] << "!" << std::endl; - ++errors; - } - } - std::cout << "Checked Arrays for Errors: " << errors << " Errors occured." << std::endl; -} - -void cudaArrayFmaOptimizedTest(int N, int M) { - std::cout << "Running cudaArrayFmaTest:" << std::endl; - std::cout << "N is " << N << ", resulting in " << (4 * sizeof(int) * N) << " Bytes of Data." << std::endl; - - size_t freeCudaMemory = getFreeCudaMemory(); - size_t totalCudaMemory = getTotalCudaMemory(); - int freeProzent = static_cast(((double)freeCudaMemory)/((double)totalCudaMemory) * 100); - - std::cout << "CUDA Device has " << freeCudaMemory << " Bytes of " << totalCudaMemory << " Bytes free (" << (freeProzent) << "%)." << std::endl; - - std::cout << "Generating random input arrays." << std::endl; - - std::default_random_engine generator; - std::uniform_int_distribution distribution(0, INT32_MAX); - - auto start_time = std::chrono::high_resolution_clock::now(); - - int* arrayA = new int[4 * N]; - int* arrayA_CPU = new int[4 * N]; - - for (int i = 0; i < 4*N; ++i) { - arrayA[i] = i * 1000 + i + (357854878 % (i+1)); - arrayA_CPU[i] = arrayA[i]; - } - - auto end_time = std::chrono::high_resolution_clock::now(); - std::cout << "Array generation took " << std::chrono::duration_cast(end_time - start_time).count() << "micros" << std::endl; - - start_time = std::chrono::high_resolution_clock::now(); - cuda_arrayFmaOptimizedHelper(arrayA_CPU, N); - end_time = std::chrono::high_resolution_clock::now(); - std::cout << "FMA on CPU took " << std::chrono::duration_cast(end_time - start_time).count() << "micros" << std::endl; - - start_time = std::chrono::high_resolution_clock::now(); - cuda_arrayFmaOptimized(arrayA, N, M); - end_time = std::chrono::high_resolution_clock::now(); - std::cout << "FMA on GPU took " << std::chrono::duration_cast(end_time - start_time).count() << "micros" << std::endl; - - int errors = 0; - for (int i = 0; i < N; i+=4) { - if (arrayA[i+3] != arrayA_CPU[i+3]) { - //std::cout << "Error in Entry " << i << ": GPU has " << arrayA[i+3] << " but CPU has " << arrayA_CPU[i+3] << "!" << std::endl; - ++errors; - } - } - std::cout << "Checked Arrays for Errors: " << errors << " Errors occured." 
<< std::endl; - - delete[] arrayA; - delete[] arrayA_CPU; -} \ No newline at end of file diff --git a/resources/cudaForStorm/src/main.cpp b/resources/cudaForStorm/src/main.cpp deleted file mode 100644 index b555cd103..000000000 --- a/resources/cudaForStorm/src/main.cpp +++ /dev/null @@ -1,69 +0,0 @@ -#include "cudaForStorm.h" - -#include -#include - -#include -#include -#include - -#include "cudaTests.h" - -int cudaForStormTest(int value) { - return value + 42; -} - - -int main_Test12345(int argc, char **argv){ - resetCudaDevice(); - - int testNumber = 0; - int N = 10000; - int M = 402653184; - if (argc > 1) { - testNumber = atoi(argv[1]); - if (argc > 2) { - N = atoi(argv[2]); - if (argc > 3) { - M = atoi(argv[3]); - } - } - } - - switch (testNumber) { - case 1: - cudaSimpleAddTest(N, M); - break; - case 2: - cudaArrayFmaTest(N); - break; - case 3: - cudaArrayFmaOptimizedTest(N, M); - break; - case 4: - cpp_cuda_bandwidthTest(M, N); - break; - case 5: - kernelSwitchTest(N); - break; - break; - // DEFAULT AND 0 - case 0: - default: - std::cout << "Available functions are:" << std::endl; - std::cout << "0 - Show this overview" << std::endl; - std::cout << "1 - cuda simpleAddTest(N, M)" << std::endl; - std::cout << "2 - cuda arrayFmaTest(N)" << std::endl; - std::cout << "3 - cuda arrayFmaOptimizedTest(N, M)" << std::endl; - std::cout << "4 - cuda bandwidthTest(M, N)" << std::endl; - std::cout << "5 - cuda kernelSwitchTest(N)" << std::endl; - std::cout << std::endl; - std::cout << "Call: " << argv[0] << " Selection [N [M]]" << std::endl; - std::cout << "Defaults:" < #include -void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector const& matrixColumnIndices, std::vector const& matrixValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); \ No newline at end of file +// Library exports +#include "cudaForStorm_Export.h" + +cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b); +cudaForStorm_EXPORT void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector const& matrixColumnIndices, std::vector const& matrixValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cudaForStorm.h b/resources/cudaForStorm/srcCuda/cudaForStorm.h new file mode 100644 index 000000000..05e08b987 --- /dev/null +++ b/resources/cudaForStorm/srcCuda/cudaForStorm.h @@ -0,0 +1,14 @@ +#ifndef STORM_CUDAFORSTORM_CUDAFORSTORM_H_ +#define STORM_CUDAFORSTORM_CUDAFORSTORM_H_ + +/* + * List of exported functions in this library + */ + +// TopologicalValueIteration +#include "srcCuda/basicValueIteration.h" + + + + +#endif // STORM_CUDAFORSTORM_CUDAFORSTORM_H_ \ No newline at end of file diff --git a/src/storm.cpp b/src/storm.cpp index 69cb0c454..183a7b139 100644 --- a/src/storm.cpp +++ b/src/storm.cpp @@ -62,6 +62,8 @@ #include "src/exceptions/InvalidSettingsException.h" +#include "cudaForStorm.h" + #include #include #include @@ -160,6 +162,7 @@ void printHeader(const int argc, const char* argv[]) { } std::cout << "Command line: " << commandStream.str() << std::endl; std::cout << "Current working directory: " << getCurrentWorkingDirectory() << std::endl << std::endl; + cudaForStormTestFunction(21, 21); } /*! 
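
A note on the call surface this patch establishes: storm.cpp now reaches the CUDA code only through the aggregating header cudaForStorm.h, which pulls in srcCuda/basicValueIteration.h with its two exports, cudaForStormTestFunction and basicValueIteration_mvReduce. The following is a minimal host-side sketch of that surface, not part of the patch series itself: the 2x2 CSR matrix, the vector contents, and the iteration bound are invented for illustration, and at this point in the series basicValueIteration_mvReduce is still the stub that merely prints a message. Only the two function names and their parameter lists are taken from the headers above.

    #include <cstdint>
    #include <vector>

    #include "cudaForStorm.h"

    int main() {
        // CSR layout: row i owns the entries [rowIndices[i], rowIndices[i + 1]).
        // Illustrative 2x2 identity matrix, one entry per row.
        std::vector<uint_fast64_t> rowIndices = { 0, 1, 2 };
        std::vector<uint_fast64_t> colIndices = { 0, 1 };
        std::vector<double> values = { 1.0, 1.0 };
        std::vector<double> x = { 0.0, 0.0 }; // iteration vector, overwritten in place
        std::vector<double> b = { 0.5, 0.5 }; // offset vector
        // One row group per state, so this "MDP" is purely deterministic.
        std::vector<uint_fast64_t> choiceIndices = { 0, 1, 2 };

        cudaForStormTestFunction(21, 21); // prints "Cuda for Storm: a + b = 42"
        basicValueIteration_mvReduce(10000, rowIndices, colIndices, values, x, b, choiceIndices);
        return 0;
    }
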
From 9388cd158ce78314a862c62bde1d5c7f9faad245 Mon Sep 17 00:00:00 2001 From: PBerger Date: Fri, 7 Mar 2014 02:59:50 +0100 Subject: [PATCH 19/43] Implementations, implementations. Former-commit-id: e203636fac87dc78f486b17bfc2e507cf365e5ba --- CMakeLists.txt | 10 +- .../srcCuda/basicValueIteration.cu | 130 +++++++++++++++++- .../srcCuda/basicValueIteration.h | 3 +- 3 files changed, 136 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d0e87995..60f3426e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ option(ENABLE_INTELTBB "Sets whether the Intel TBB is available." OFF) option(STORM_USE_COTIRE "Sets whether Cotire should be used (for building precompiled headers)." OFF) option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF) option(USE_LIBCXX "Sets whether the standard library is libc++." OFF) -option(ENABLE_CUDAFORSTORM "Sets whether StoRM is built with its CUDA extension." OFF) +option(STORM_USE_CUDAFORSTORM "Sets whether StoRM is built with its CUDA extension." OFF) set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).") set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).") set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.") @@ -180,7 +180,7 @@ endif() set(STORM_CPP_GLPK_DEF "define") # CUDA Defines -if (ENABLE_CUDAFORSTORM) +if (STORM_USE_CUDAFORSTORM) set(STORM_CPP_CUDAFORSTORM_DEF "define") else() set(STORM_CPP_CUDAFORSTORM_DEF "undef") @@ -289,7 +289,7 @@ endif() if (ENABLE_Z3) link_directories("${Z3_ROOT}/bin") endif() -if (ENABLE_CUDAFORSTORM) +if (STORM_USE_CUDAFORSTORM) link_directories("${PROJECT_SOURCE_DIR}/build/cudaForStorm/lib") endif() if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL "")) @@ -328,14 +328,14 @@ target_link_libraries(storm-performance-tests ${Boost_LIBRARIES}) ## CUDA For Storm ## ############################################################# -if (ENABLE_CUDAFORSTORM) +if (STORM_USE_CUDAFORSTORM) message (STATUS "StoRM - Linking with CudaForStorm") include_directories("${PROJECT_BINARY_DIR}/cudaForStorm/include") include_directories("${PROJECT_SOURCE_DIR}/resources/cudaForStorm") target_link_libraries(storm cudaForStorm) target_link_libraries(storm-functional-tests cudaForStorm) target_link_libraries(storm-performance-tests cudaForStorm) -endif(ENABLE_CUDAFORSTORM) +endif(STORM_USE_CUDAFORSTORM) ############################################################# ## diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index 18f1f7b02..80bb87e57 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -15,8 +15,136 @@ void cudaForStormTestFunction(int a, int b) { std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl; } -void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector const& matrixColumnIndices, std::vector const& matrixValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { +void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { + if (sizeof(double) != sizeof(uint_fast64_t)) { + std::cout << "FATAL ERROR - Internal Sizes of Double and uint_fast64_t do NOT 
match, CUDA acceleration not possible!" << std::endl; + return; + } + uint_fast64_t* device_matrixRowIndices = nullptr; + uint_fast64_t* device_matrixColIndicesAndValues = nullptr; + double* device_x = nullptr; + double* device_b = nullptr; + double* device_multiplyResult = nullptr; + uint_fast64_t* device_nondeterministicChoiceIndices = nullptr; + + cudaError_t cudaMallocResult; + + cudaMallocResult = cudaMalloc(&device_matrixRowIndices, matrixRowIndices.size()); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + cudaMallocResult = cudaMalloc(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + cudaMallocResult = cudaMalloc(&device_x, x.size()); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + cudaMallocResult = cudaMalloc(&device_b, b.size()); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + cudaMallocResult = cudaMalloc(&device_multiplyResult, b.size()); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + cudaMallocResult = cudaMalloc(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size()); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." 
<< std::endl; + goto cleanup; + } + + // Memory allocated, copy data to device + cudaError_t cudaCopyResult; + + cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(uint_fast64_t) * matrixRowIndices.size(), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(uint_fast64_t) * columnIndicesAndValues.size()) + (sizeof(double) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(double) * x.size(), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(double) * b.size(), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(uint_fast64_t) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + // Data is on device, start Kernel + + // All code related to freeing memory and clearing up the device +cleanup: + if (device_matrixRowIndices != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl; + } + device_matrixRowIndices = nullptr; + } + if (device_matrixColIndicesAndValues != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; + } + device_matrixColIndicesAndValues = nullptr; + } + if (device_x != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_x); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; + } + device_x = nullptr; + } + if (device_b != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_b); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl; + } + device_b = nullptr; + } + if (device_multiplyResult != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_multiplyResult); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_multiplyResult = nullptr; + } + if (device_nondeterministicChoiceIndices != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." << std::endl; + } + device_nondeterministicChoiceIndices = nullptr; + } } /* diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index 88e8a92e5..fc5a8a322 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -1,8 +1,9 @@ #include #include +#include // Library exports #include "cudaForStorm_Export.h" cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b); -cudaForStorm_EXPORT void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector const& matrixColumnIndices, std::vector const& matrixValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); \ No newline at end of file +cudaForStorm_EXPORT void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); \ No newline at end of file From a964846e2dd0d7f3eb5166fd8936bfb33d65ba94 Mon Sep 17 00:00:00 2001 From: PBerger Date: Fri, 7 Mar 2014 23:54:18 +0100 Subject: [PATCH 20/43] Added cusplibrary as a git submodule. Former-commit-id: 152764c8f376f351fa1eab7fa44e0c1586c8d4a9 --- .gitmodules | 3 ++ resources/3rdparty/cusplibrary | 1 + resources/cmake/FindThrust.cmake | 53 +++++++++++++++++++++++++++ resources/cudaForStorm/CMakeLists.txt | 9 +++++ 4 files changed, 66 insertions(+) create mode 100644 .gitmodules create mode 160000 resources/3rdparty/cusplibrary create mode 100644 resources/cmake/FindThrust.cmake diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..92b35f18f --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "resources/3rdparty/cusplibrary"] + path = resources/3rdparty/cusplibrary + url = https://github.com/cusplibrary/cusplibrary.git diff --git a/resources/3rdparty/cusplibrary b/resources/3rdparty/cusplibrary new file mode 160000 index 000000000..d8d7d9e97 --- /dev/null +++ b/resources/3rdparty/cusplibrary @@ -0,0 +1 @@ +Subproject commit d8d7d9e97add8db08ef0ad5c0a7e9929fd83ce3c diff --git a/resources/cmake/FindThrust.cmake b/resources/cmake/FindThrust.cmake new file mode 100644 index 000000000..9ad65b7e8 --- /dev/null +++ b/resources/cmake/FindThrust.cmake @@ -0,0 +1,53 @@ +# +# FindThrust +# +# This module finds the Thrust header files and extrats their version. It +# sets the following variables. +# +# THRUST_INCLUDE_DIR - Include directory for thrust header files. (All header +# files will actually be in the thrust subdirectory.) +# THRUST_VERSION - Version of thrust in the form "major.minor.patch". 
+# + +find_path( THRUST_INCLUDE_DIR + HINTS + /usr/include/cuda + /usr/local/include + /usr/local/cuda/include + ${CUDA_INCLUDE_DIRS} + NAMES thrust/version.h + DOC "Thrust headers" + ) +if( THRUST_INCLUDE_DIR ) + list( REMOVE_DUPLICATES THRUST_INCLUDE_DIR ) +endif( THRUST_INCLUDE_DIR ) + +# Find thrust version +file( STRINGS ${THRUST_INCLUDE_DIR}/thrust/version.h + version + REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)" + ) +string( REGEX REPLACE + "#define THRUST_VERSION[ \t]+" + "" + version + "${version}" + ) + +string( REGEX MATCH "^[0-9]" major ${version} ) +string( REGEX REPLACE "^${major}00" "" version "${version}" ) +string( REGEX MATCH "^[0-9]" minor ${version} ) +string( REGEX REPLACE "^${minor}0" "" version "${version}" ) +set( THRUST_VERSION "${major}.${minor}.${version}") +set( THRUST_MAJOR_VERSION "${major}") +set( THRUST_MINOR_VERSION "${minor}") + +# Check for required components +include( FindPackageHandleStandardArgs ) +find_package_handle_standard_args( Thrust + REQUIRED_VARS THRUST_INCLUDE_DIR + VERSION_VAR THRUST_VERSION + ) + +set(THRUST_INCLUDE_DIRS ${THRUST_INCLUDE_DIR}) +mark_as_advanced(THRUST_INCLUDE_DIR) \ No newline at end of file diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt index f3e679e92..acedd56d1 100644 --- a/resources/cudaForStorm/CMakeLists.txt +++ b/resources/cudaForStorm/CMakeLists.txt @@ -38,6 +38,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/") find_package(CUDA REQUIRED) find_package(Doxygen REQUIRED) find_package(Threads REQUIRED) +find_package(Thrust REQUIRED) # If the DEBUG option was turned on, we will target a debug version and a release version otherwise if (CUDAFORSTORM_DEBUG) @@ -209,6 +210,14 @@ include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${THREADS_INCLUDE_DIRS}) target_link_libraries(cudaForStorm ${CMAKE_THREAD_LIBS_INIT}) +############################################################# +## +## Thrust +## +############################################################# +include_directories(${THRUST_INCLUDE_DIR}) +message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION}") + if (MSVC) # Add the DebugHelper DLL set(CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES} Dbghelp.lib") From 71e077f420397eff251ed24365a8bdb075f4342d Mon Sep 17 00:00:00 2001 From: PBerger Date: Sat, 8 Mar 2014 18:28:39 +0100 Subject: [PATCH 21/43] Compiles with CUSP :) Former-commit-id: 78555303bf9615a97c25127082b89c89d53748ec --- resources/cmake/FindCusp.cmake | 55 +++++++++++ resources/cmake/FindThrust.cmake | 69 +++++++------- .../cudaForStorm/CMakeAlignmentCheck.cpp | 64 +++++++++++++ resources/cudaForStorm/CMakeLists.txt | 93 ++++++++++++------- .../srcCuda/basicValueIteration.cu | 87 ++++++----------- .../srcCuda/basicValueIteration.h | 2 +- .../cudaForStorm/srcCuda/cuspExtension.h | 83 +++++++++++++++++ .../cudaForStorm/storm-cudaplugin-config.h.in | 13 +++ 8 files changed, 337 insertions(+), 129 deletions(-) create mode 100644 resources/cmake/FindCusp.cmake create mode 100644 resources/cudaForStorm/CMakeAlignmentCheck.cpp create mode 100644 resources/cudaForStorm/srcCuda/cuspExtension.h create mode 100644 resources/cudaForStorm/storm-cudaplugin-config.h.in diff --git a/resources/cmake/FindCusp.cmake b/resources/cmake/FindCusp.cmake new file mode 100644 index 000000000..9520d1426 --- /dev/null +++ b/resources/cmake/FindCusp.cmake @@ -0,0 +1,55 @@ +# +# FindCusp +# +# This module finds the CUSP header files and extracts their version. 
It +# sets the following variables. +# +# CUSP_INCLUDE_DIR - Include directory for cusp header files. (All header +# files will actually be in the cusp subdirectory.) +# CUSP_VERSION - Version of cusp in the form "major.minor.patch". +# +# CUSP_FOUND - Indicates whether Cusp has been found +# + +find_path(CUSP_INCLUDE_DIR + HINTS + /usr/include/cusp + /usr/local/include + /usr/local/cusp/include + ${CUSP_INCLUDE_DIRS} + ${CUSP_HINT} + NAMES cusp/version.h + DOC "Cusp headers" +) +if(CUSP_INCLUDE_DIR) + list(REMOVE_DUPLICATES CUSP_INCLUDE_DIR) +endif(CUSP_INCLUDE_DIR) + +# Find cusp version +file(STRINGS ${CUSP_INCLUDE_DIR}/cusp/version.h + version + REGEX "#define CUSP_VERSION[ \t]+([0-9x]+)" +) +string(REGEX REPLACE + "#define CUSP_VERSION[ \t]+" + "" + version + "${version}" +) + +#define CUSP_MAJOR_VERSION (CUSP_VERSION / 100000) +#define CUSP_MINOR_VERSION (CUSP_VERSION / 100 % 1000) +#define CUSP_SUBMINOR_VERSION (CUSP_VERSION % 100) + +math(EXPR CUSP_MAJOR_VERSION "${version} / 100000") +math(EXPR CUSP_MINOR_VERSION "${version} / 100 % 1000") +math(EXPR CUSP_PATCH_VERSION "${version} % 100") + +set(CUSP_VERSION "${CUSP_MAJOR_VERSION}.${CUSP_MINOR_VERSION}.${CUSP_PATCH_VERSION}") + +# Check for required components +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Cusp REQUIRED_VARS CUSP_INCLUDE_DIR VERSION_VAR CUSP_VERSION) + +set(CUSP_INCLUDE_DIRS ${CUSP_INCLUDE_DIR}) +mark_as_advanced(CUSP_INCLUDE_DIR) \ No newline at end of file diff --git a/resources/cmake/FindThrust.cmake b/resources/cmake/FindThrust.cmake index 9ad65b7e8..8f811bda3 100644 --- a/resources/cmake/FindThrust.cmake +++ b/resources/cmake/FindThrust.cmake @@ -1,53 +1,52 @@ # # FindThrust # -# This module finds the Thrust header files and extrats their version. It +# This module finds the Thrust header files and extracts their version. It # sets the following variables. # # THRUST_INCLUDE_DIR - Include directory for thrust header files. (All header # files will actually be in the thrust subdirectory.) # THRUST_VERSION - Version of thrust in the form "major.minor.patch". 
# +# Thrust_FOUND - Indicates whether Thrust has been found +# -find_path( THRUST_INCLUDE_DIR - HINTS - /usr/include/cuda - /usr/local/include - /usr/local/cuda/include - ${CUDA_INCLUDE_DIRS} - NAMES thrust/version.h - DOC "Thrust headers" - ) -if( THRUST_INCLUDE_DIR ) - list( REMOVE_DUPLICATES THRUST_INCLUDE_DIR ) -endif( THRUST_INCLUDE_DIR ) +find_path(THRUST_INCLUDE_DIR + HINTS + /usr/include/cuda + /usr/local/include + /usr/local/cuda/include + ${CUDA_INCLUDE_DIRS} + NAMES thrust/version.h + DOC "Thrust headers" +) +if(THRUST_INCLUDE_DIR) + list(REMOVE_DUPLICATES THRUST_INCLUDE_DIR) +endif(THRUST_INCLUDE_DIR) # Find thrust version -file( STRINGS ${THRUST_INCLUDE_DIR}/thrust/version.h - version - REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)" - ) -string( REGEX REPLACE - "#define THRUST_VERSION[ \t]+" - "" - version - "${version}" - ) +file(STRINGS ${THRUST_INCLUDE_DIR}/thrust/version.h + version + REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)" +) +string(REGEX REPLACE + "#define THRUST_VERSION[ \t]+" + "" + version + "${version}" +) -string( REGEX MATCH "^[0-9]" major ${version} ) -string( REGEX REPLACE "^${major}00" "" version "${version}" ) -string( REGEX MATCH "^[0-9]" minor ${version} ) -string( REGEX REPLACE "^${minor}0" "" version "${version}" ) -set( THRUST_VERSION "${major}.${minor}.${version}") -set( THRUST_MAJOR_VERSION "${major}") -set( THRUST_MINOR_VERSION "${minor}") +string(REGEX MATCH "^[0-9]" major ${version}) +string(REGEX REPLACE "^${major}00" "" version "${version}") +string(REGEX MATCH "^[0-9]" minor ${version}) +string(REGEX REPLACE "^${minor}0" "" version "${version}") +set(THRUST_VERSION "${major}.${minor}.${version}") +set(THRUST_MAJOR_VERSION "${major}") +set(THRUST_MINOR_VERSION "${minor}") # Check for required components -include( FindPackageHandleStandardArgs ) -find_package_handle_standard_args( Thrust - REQUIRED_VARS THRUST_INCLUDE_DIR - VERSION_VAR THRUST_VERSION - ) +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Thrust REQUIRED_VARS THRUST_INCLUDE_DIR VERSION_VAR THRUST_VERSION) set(THRUST_INCLUDE_DIRS ${THRUST_INCLUDE_DIR}) mark_as_advanced(THRUST_INCLUDE_DIR) \ No newline at end of file diff --git a/resources/cudaForStorm/CMakeAlignmentCheck.cpp b/resources/cudaForStorm/CMakeAlignmentCheck.cpp new file mode 100644 index 000000000..1dc9b470b --- /dev/null +++ b/resources/cudaForStorm/CMakeAlignmentCheck.cpp @@ -0,0 +1,64 @@ +/* + * This is component of StoRM - Cuda Plugin to check whether type alignment matches the assumptions done while optimizing the code. 
+ */
+ #include <cstdint>
+ #include <vector>
+ #include <utility>
+ 
+ #define CONTAINER_SIZE 100ul
+ 
+ template <typename IndexType, typename ValueType>
+ int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
+ 	std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
+ 	for (size_t i = 0; i < containerSize; ++i) {
+ 		myVector->push_back(std::make_pair(firstValue, secondValue));
+ 	}
+ 	size_t myVectorSize = myVector->size();
+ 	IndexType* firstStart = &(myVector->at(0).first);
+ 	IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
+ 	ValueType* secondStart = &(myVector->at(0).second);
+ 	ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
+ 	size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
+ 	size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
+ 	size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
+ 	size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);
+ 	
+ 	delete myVector;
+ 	myVector = nullptr;
+ 	
+ 	if (myVectorSize != containerSize) {
+ 		return -2;
+ 	}
+ 	
+ 	// Check for alignment:
+ 	// Requirement is that the pairs are aligned like: first, second, first, second, first, second, ...
+ 	if (sizeof(IndexType) != sizeof(ValueType)) {
+ 		return -3;
+ 	}
+ 	if (startOffset != sizeof(IndexType)) {
+ 		return -4;
+ 	}
+ 	if (endOffset != sizeof(IndexType)) {
+ 		return -5;
+ 	}
+ 	if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
+ 		return -6;
+ 	}
+ 	if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
+ 		return -7;
+ 	}
+ 	
+ 	return 0;
+ }
+ 
+ 
+ int main(int argc, char* argv[]) {
+ 	int result = 0;
+ 	
+ 	result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
+ 	if (result != 0) {
+ 		return result;
+ 	}
+ 	
+ 	return 0;
+ }
\ No newline at end of file
diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt
index acedd56d1..6b0496d51 100644
--- a/resources/cudaForStorm/CMakeLists.txt
+++ b/resources/cudaForStorm/CMakeLists.txt
@@ -35,9 +35,12 @@ set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE
 # Add the resources/cmake folder to Module Search Path for FindTBB.cmake
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/")
 
+# Set the hint for CUSP
+set(CUSP_HINT "${PROJECT_SOURCE_DIR}/../3rdparty/cusplibrary")
+
 find_package(CUDA REQUIRED)
+find_package(Cusp REQUIRED)
 find_package(Doxygen REQUIRED)
-find_package(Threads REQUIRED)
 find_package(Thrust REQUIRED)
 
 # If the DEBUG option was turned on, we will target a debug version and a release version otherwise
@@ -115,10 +118,23 @@ endif()
 ##
 #############################################################
 
+# Test for type alignment
+try_run(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT
+	${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp"
+	COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR
+)
+if(NOT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT)
+	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}")
+elseif(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT EQUAL 0)
+	message(STATUS "StoRM (CudaPlugin) - Result of Type Alignment Check: OK.")
+else()
+	message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})")
+endif()
+
 # Configure a header file to pass some of the CMake settings to the source code
configure_file ( - "${PROJECT_SOURCE_DIR}/../../storm-config.h.in" - "${PROJECT_BINARY_DIR}/include/storm-config.h" + "${PROJECT_SOURCE_DIR}/storm-cudaplugin-config.h.in" + "${PROJECT_BINARY_DIR}/include/storm-cudaplugin-config.h" ) # Add the binary dir include directory for storm-config.h include_directories("${PROJECT_BINARY_DIR}/include") @@ -161,22 +177,6 @@ endif(ADDITIONAL_LINK_DIRS) ############################################################# -############################################################################### -## # -## Executable Creation # -## # -## All link_directories() calls MUST be made before this point # -## # -############################################################################### -include (GenerateExportHeader) - -#add_library(cudaForStorm SHARED ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) -#GENERATE_EXPORT_HEADER( cudaForStorm -# BASE_NAME cudaForStorm -# EXPORT_MACRO_NAME cudaForStorm_EXPORT -# EXPORT_FILE_NAME include/cudaForStorm_Export.h -# STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC -#) ############################################################# ## @@ -184,18 +184,6 @@ include (GenerateExportHeader) ## ############################################################# #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30) -cuda_add_library(cudaForStorm SHARED - ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS} - OPTIONS -DSTUFF="" -arch=sm_30 - RELEASE -DNDEBUG - DEBUG -g -DDEBUG -) -GENERATE_EXPORT_HEADER( cudaForStorm - BASE_NAME cudaForStorm - EXPORT_MACRO_NAME cudaForStorm_EXPORT - EXPORT_FILE_NAME include/cudaForStorm_Export.h - STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC -) #target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY}) #ADD_DEPENDENCIES(cudaForStorm cudaLibrary) #target_link_libraries(cudaForStorm cudaLibrary) @@ -204,19 +192,52 @@ include_directories(${CUDA_INCLUDE_DIRS}) ############################################################# ## -## Threads +## CUSP ## ############################################################# -include_directories(${THREADS_INCLUDE_DIRS}) -target_link_libraries(cudaForStorm ${CMAKE_THREAD_LIBS_INIT}) +if(CUSP_FOUND) + include_directories(${CUSP_INCLUDE_DIR}) + cuda_include_directories(${CUSP_INCLUDE_DIR}) + message(STATUS "StoRM (CudaPlugin) - Found CUSP Version ${CUSP_VERSION} in location ${CUSP_INCLUDE_DIR}") +else() + message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find CUSP!") +endif() ############################################################# ## ## Thrust ## ############################################################# -include_directories(${THRUST_INCLUDE_DIR}) -message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION}") +if(THRUST_FOUND) + include_directories(${THRUST_INCLUDE_DIR}) + cuda_include_directories(${THRUST_INCLUDE_DIR}) + message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION} in location ${THRUST_INCLUDE_DIR}") +else() + message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find Thrust! 
Check your CUDA installation.") +endif() + +############################################################################### +## # +## Executable Creation # +## # +## All link_directories() calls AND include_directories() calls # +## MUST be made before this point # +## # +############################################################################### +include (GenerateExportHeader) + +cuda_add_library(cudaForStorm SHARED + ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS} + OPTIONS -DSTUFF="" -arch=sm_30 + RELEASE -DNDEBUG + DEBUG -g -DDEBUG +) +GENERATE_EXPORT_HEADER( cudaForStorm + BASE_NAME cudaForStorm + EXPORT_MACRO_NAME cudaForStorm_EXPORT + EXPORT_FILE_NAME include/cudaForStorm_Export.h + STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC +) if (MSVC) # Add the DebugHelper DLL diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index 80bb87e57..f23e7ed09 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -6,61 +6,54 @@ #include #include "cusparse_v2.h" +#include "cuspExtension.h" __global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) { *B = *A; } -void cudaForStormTestFunction(int a, int b) { - std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl; -} - -void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { - if (sizeof(double) != sizeof(uint_fast64_t)) { - std::cout << "FATAL ERROR - Internal Sizes of Double and uint_fast64_t do NOT match, CUDA acceleration not possible!" << std::endl; - return; - } - - uint_fast64_t* device_matrixRowIndices = nullptr; - uint_fast64_t* device_matrixColIndicesAndValues = nullptr; - double* device_x = nullptr; - double* device_b = nullptr; - double* device_multiplyResult = nullptr; - uint_fast64_t* device_nondeterministicChoiceIndices = nullptr; +template +void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { + IndexType* device_matrixRowIndices = nullptr; + IndexType* device_matrixColIndicesAndValues = nullptr; + ValueType* device_x = nullptr; + ValueType* device_b = nullptr; + ValueType* device_multiplyResult = nullptr; + IndexType* device_nondeterministicChoiceIndices = nullptr; cudaError_t cudaMallocResult; - cudaMallocResult = cudaMalloc(&device_matrixRowIndices, matrixRowIndices.size()); + cudaMallocResult = cudaMalloc(&device_matrixRowIndices, matrixRowIndices.size()); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2); + cudaMallocResult = cudaMalloc(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." 
<< std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_x, x.size()); + cudaMallocResult = cudaMalloc(&device_x, x.size()); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_b, b.size()); + cudaMallocResult = cudaMalloc(&device_b, b.size()); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_multiplyResult, b.size()); + cudaMallocResult = cudaMalloc(&device_multiplyResult, b.size()); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size()); + cudaMallocResult = cudaMalloc(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size()); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; @@ -69,31 +62,31 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve // Memory allocated, copy data to device cudaError_t cudaCopyResult; - cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(uint_fast64_t) * matrixRowIndices.size(), cudaMemcpyHostToDevice); + cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * matrixRowIndices.size(), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; goto cleanup; } - cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(uint_fast64_t) * columnIndicesAndValues.size()) + (sizeof(double) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * columnIndicesAndValues.size()) + (sizeof(ValueType) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; goto cleanup; } - cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(double) * x.size(), cudaMemcpyHostToDevice); + cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * x.size(), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; goto cleanup; } - cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(double) * b.size(), cudaMemcpyHostToDevice); + cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * b.size(), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; goto cleanup; } - cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(uint_fast64_t) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice); + cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), 
sizeof(IndexType) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; goto cleanup; @@ -101,6 +94,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve // Data is on device, start Kernel + // All code related to freeing memory and clearing up the device cleanup: if (device_matrixRowIndices != nullptr) { @@ -148,35 +142,14 @@ cleanup: } /* -void kernelSwitchTest(size_t N) { - int* deviceIntA; - int* deviceIntB; + * Declare and implement all exported functions for these Kernels here + * + */ - if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) { - std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; - return; - } - if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) { - std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; - return; - } - - // Allocate space on the device - auto start_time = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < N; ++i) { - cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB); - } - auto end_time = std::chrono::high_resolution_clock::now(); - std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast(end_time - start_time).count() << "micros" << std::endl; - std::cout << "Resulting in " << (std::chrono::duration_cast(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl; +void cudaForStormTestFunction(int a, int b) { + std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl; +} - // Free memory on device - if (cudaFree(deviceIntA) != cudaSuccess) { - std::cout << "Error in cudaFree!" << std::endl; - return; - } - if (cudaFree(deviceIntB) != cudaSuccess) { - std::cout << "Error in cudaFree!" 
<< std::endl; - return; - } -}*/ \ No newline at end of file +void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { + basicValueIteration_mvReduce(maxIterationCount, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); +} \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index fc5a8a322..1316e5014 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -6,4 +6,4 @@ #include "cudaForStorm_Export.h" cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b); -cudaForStorm_EXPORT void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); \ No newline at end of file +cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h new file mode 100644 index 000000000..238b3aa36 --- /dev/null +++ b/resources/cudaForStorm/srcCuda/cuspExtension.h @@ -0,0 +1,83 @@ +/* + * This is an extension of the original CUSP csr_vector.h SPMV implementation. + * It is based on the Code and incorporates changes as to cope with the details + * of the StoRM code. + * As this is mostly copy & past, the original license still applies. + */ + +/* + * Copyright 2008-2009 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#pragma once + +#include + +namespace cusp +{ +namespace detail +{ +namespace device +{ + +template +void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + spmv_csr_vector_kernel <<>> + (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); + + if (UseCache) + unbind_x(x); +} + +template +void storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) +{ + const IndexType nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +template +void storm_cuda_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) +{ + const IndexType nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +} // end namespace device +} // end namespace detail +} // end namespace cusp \ No newline at end of file diff --git a/resources/cudaForStorm/storm-cudaplugin-config.h.in b/resources/cudaForStorm/storm-cudaplugin-config.h.in new file mode 100644 index 000000000..d59532a6c --- /dev/null +++ b/resources/cudaForStorm/storm-cudaplugin-config.h.in @@ -0,0 +1,13 @@ +/* + * StoRM - Build-in Options + * + * This file is parsed by CMake during makefile generation + * It contains information such as the base path to the test/example data + */ + +#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ +#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ + + + +#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ From c0a7e424869c3ebddb65e3a3e3a7963f5763567e Mon Sep 17 00:00:00 2001 From: PBerger Date: Tue, 11 Mar 2014 01:41:57 
+0100 Subject: [PATCH 22/43] Implemented a basic but complete kernel for value iteration in CUDA. It doesnt work :( Former-commit-id: 6a3a7aa505f91adb7034d35f2e0700b59b05718d --- .../srcCuda/basicValueIteration.cu | 168 +++++++++++- .../srcCuda/basicValueIteration.h | 8 +- resources/cudaForStorm/srcCuda/cudaForStorm.h | 3 + .../cudaForStorm/srcCuda/cuspExtension.h | 257 ++++++++++++++++++ resources/cudaForStorm/srcCuda/utility.cu | 18 +- resources/cudaForStorm/srcCuda/utility.h | 15 +- ...onNondeterministicLinearEquationSolver.cpp | 24 +- src/storage/SparseMatrix.h | 6 + 8 files changed, 477 insertions(+), 22 deletions(-) diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index f23e7ed09..31fb8d4ba 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -1,4 +1,5 @@ #include "basicValueIteration.h" +#define CUSP_USE_TEXTURE_MEMORY #include #include @@ -6,54 +7,111 @@ #include #include "cusparse_v2.h" +#include "utility.h" + #include "cuspExtension.h" +#include +#include +#include + + +#define CUDA_CHECK_ALL_ERRORS() do { \ + cudaError_t errSync = cudaGetLastError(); \ + cudaError_t errAsync = cudaDeviceSynchronize(); \ + if (errSync != cudaSuccess) { \ + std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ")" << std::endl; \ + } \ + if (errAsync != cudaSuccess) { \ + std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \ + } } while(false) __global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) { *B = *A; } -template -void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { +template +struct equalModuloPrecision : public thrust::binary_function +{ +__host__ __device__ T operator()(const T &x, const T &y) const +{ + if (Relative) { + const T result = (x - y) / y; + return (result > 0) ? result : -result; + } else { + const T result = (x - y); + return (result > 0) ? result : -result; + } +} +}; + +template +void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { IndexType* device_matrixRowIndices = nullptr; IndexType* device_matrixColIndicesAndValues = nullptr; ValueType* device_x = nullptr; + ValueType* device_xSwap = nullptr; ValueType* device_b = nullptr; ValueType* device_multiplyResult = nullptr; IndexType* device_nondeterministicChoiceIndices = nullptr; + std::cout.sync_with_stdio(true); + std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory()))*100 << "%)." << std::endl; + size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size(); + std::cout << "(DLL) We will allocate " << memSize << " Bytes." 
<< std::endl; + + const IndexType matrixRowCount = matrixRowIndices.size() - 1; + const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1; + const IndexType matrixNnzCount = columnIndicesAndValues.size(); + cudaError_t cudaMallocResult; - cudaMallocResult = cudaMalloc(&device_matrixRowIndices, matrixRowIndices.size()); + bool converged = false; + uint_fast64_t iterationCount = 0; + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1)); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2); + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_x, x.size()); + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_x), sizeof(ValueType) * matrixColCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_b, b.size()); + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_xSwap), sizeof(ValueType) * matrixColCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_b), sizeof(ValueType) * matrixRowCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_multiplyResult, b.size()); + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size()); + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixRowCount + 1)); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." 
<< std::endl;
		goto cleanup;
@@ -62,38 +120,99 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve
 	// Memory allocated, copy data to device
 	cudaError_t cudaCopyResult;
 
-	cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * matrixRowIndices.size(), cudaMemcpyHostToDevice);
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}
 
-	cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * columnIndicesAndValues.size()) + (sizeof(ValueType) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice);
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}
 
-	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * x.size(), cudaMemcpyHostToDevice);
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}
 
-	cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * b.size(), cudaMemcpyHostToDevice);
+	// Preset the xSwap to zeros...
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount);
+	if (cudaCopyResult != cudaSuccess) {
+		std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl;
+		goto cleanup;
+	}
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}
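[Editor's note] The column indices and values travel to the device in a single cudaMemcpy because the host stores them as one interleaved array of (column index, value) pairs. The following minimal host-side sketch is not part of the patch; the toy matrix and all names are illustrative, assuming the uint_fast64_t/double instantiation. It shows the CSR layout and the padding assumption that a raw byte copy of std::pair data relies on:

    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        using IndexType = std::uint_fast64_t;
        using ValueType = double;

        // Toy 2x2 CSR matrix [[0.5, 0.5], [0.0, 1.0]]: the entries of row i
        // live in columnIndicesAndValues[rowIndices[i] .. rowIndices[i+1]).
        std::vector<IndexType> rowIndices = {0, 2, 3};
        std::vector<std::pair<IndexType, ValueType>> columnIndicesAndValues = {
            {0, 0.5}, {1, 0.5}, // row 0
            {1, 1.0}            // row 1
        };

        // The one-shot cudaMemcpy in the patch treats this vector as
        // sizeof(IndexType)*nnz + sizeof(ValueType)*nnz raw bytes, which is
        // only sound if the pair carries no padding:
        static_assert(sizeof(std::pair<IndexType, ValueType>) == sizeof(IndexType) + sizeof(ValueType),
                      "interleaved (index, value) layout requires a padding-free pair");

        std::cout << "bytes to copy: "
                  << sizeof(std::pair<IndexType, ValueType>) * columnIndicesAndValues.size()
                  << std::endl;
        return 0;
    }

If the pair type ever carried padding, the interleaved kernels would read garbage, so the static_assert is a cheap guard.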
-	cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice);
+	// Preset the multiplyResult to zeros...
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
+	if (cudaCopyResult != cudaSuccess) {
+		std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
+		goto cleanup;
+	}
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Nondeterministic Choice Indices, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}
 
 	// Data is on device, start Kernel
+	while (!converged && iterationCount < maxIterationCount)
+	{ // In a sub-area since transfer of control via label evades initialization
+		cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
+		CUDA_CHECK_ALL_ERRORS();
+
+		thrust::device_ptr devicePtrThrust_b(device_b);
+		thrust::device_ptr devicePtrThrust_multiplyResult(device_multiplyResult);
+
+		// Transform: Add multiplyResult + b inplace to multiplyResult
+		thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus());
+		CUDA_CHECK_ALL_ERRORS();
+
+		// Reduce: Reduce multiplyResult to a new x vector
+		cusp::detail::device::storm_cuda_opt_vector_reduce(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
+		CUDA_CHECK_ALL_ERRORS();
+
+		// Check for convergence
+		// Transform: x = abs(x - xSwap)/ xSwap
+		thrust::device_ptr devicePtrThrust_x(device_x);
+		thrust::device_ptr devicePtrThrust_x_end(device_x + matrixColCount);
+		thrust::device_ptr devicePtrThrust_xSwap(device_xSwap);
+		thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision());
+		CUDA_CHECK_ALL_ERRORS();
+		// Reduce: get Max over x and check for res < Precision
+		ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, 0, thrust::maximum());
+		CUDA_CHECK_ALL_ERRORS();
+		converged = maxX < precision;
+		++iterationCount;
+
+		// Swap pointers, device_x always contains the most current result
+		std::swap(device_x, device_xSwap);
+	}
+	std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
+
+	// Get x back from the device
+	cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost);
+	if (cudaCopyResult != cudaSuccess) {
+		std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl;
+		goto cleanup;
+	}
 
 	// All code related to freeing memory and clearing up the device
 cleanup:
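[Editor's note] Each iteration above computes multiplyResult = A*x, adds b, reduces every row group to the min/max over its nondeterministic choices, and then reuses device_x as scratch space for the convergence test. A host-side reference for that test, as an editor's sketch assuming ValueType = double (hasConverged is not a name from the patch), would look like this:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Mirrors the thrust::transform with equalModuloPrecision followed by the
    // thrust::reduce with thrust::maximum: iteration stops once the largest
    // element-wise difference between x and xSwap drops below the precision.
    template<typename ValueType>
    bool hasConverged(std::vector<ValueType> const& x, std::vector<ValueType> const& xSwap,
                      ValueType const precision, bool const relative) {
        ValueType maxDiff = 0; // differences are absolute values, so 0 is a safe seed
        for (std::size_t i = 0; i < x.size(); ++i) {
            ValueType diff = x[i] - xSwap[i];
            if (relative) {
                diff /= xSwap[i]; // like the kernel functor, undefined for xSwap[i] == 0
            }
            maxDiff = std::max(maxDiff, std::abs(diff));
        }
        return maxDiff < precision;
    }

Note that the device code seeds thrust::reduce with 0, which is harmless because the transformed values are absolute differences; PATCH 24 further below nevertheless switches the seed to -std::numeric_limits<double>::max().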
@@ -118,6 +237,13 @@ cleanup:
 		}
 		device_x = nullptr;
 	}
+	if (device_xSwap != nullptr) {
+		cudaError_t cudaFreeResult = cudaFree(device_xSwap);
+		if (cudaFreeResult != cudaSuccess) {
+			std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl;
+		}
+		device_xSwap = nullptr;
+	}
 	if (device_b != nullptr) {
 		cudaError_t cudaFreeResult = cudaFree(device_b);
 		if (cudaFreeResult != cudaSuccess) {
@@ -150,6 +276,18 @@ void cudaForStormTestFunction(int a, int b) {
 	std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl;
 }
 
-void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) {
-	basicValueIteration_mvReduce(maxIterationCount, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
+void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) {
+	if (relativePrecisionCheck) {
+		basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
+	} else {
+		basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
+	}
+}
+
+void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) {
+	if (relativePrecisionCheck) {
+		basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
+	} else {
+		basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
+	}
 }
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h
index 1316e5014..61529d963 100644
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.h
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h
@@ -1,3 +1,6 @@
+#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
+#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
+
 #include
 #include
 #include
@@ -6,4 +9,7 @@
 #include "cudaForStorm_Export.h"
 
 cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b);
-cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices);
\ No newline at end of file
+cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices);
+cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices);
+
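[Editor's note] A minimal call-site sketch for the two entry points declared above. This is an editor's addition, not part of the plugin: solveToyMdp and the toy MDP data are illustrative, assuming the cudaForStorm.h umbrella header from this patch series is on the include path.

    #include <cstdint>
    #include <utility>
    #include <vector>

    #include "cudaForStorm.h"

    void solveToyMdp() {
        // Two states, one row (choice) per state; group i spans rows
        // [nondeterministicChoiceIndices[i], nondeterministicChoiceIndices[i+1]).
        std::vector<std::uint_fast64_t> rowIndices = {0, 1, 2};
        std::vector<std::pair<std::uint_fast64_t, double>> columnIndicesAndValues = {
            {1, 1.0}, // row 0: move to state 1 with probability 1
            {1, 1.0}  // row 1: stay in state 1
        };
        std::vector<double> x = {0.0, 0.0}; // in/out: one entry per state
        std::vector<double> b = {0.0, 0.0}; // one entry per row
        std::vector<std::uint_fast64_t> nondeterministicChoiceIndices = {0, 1, 2};

        // At most 10000 iterations, absolute (non-relative) precision check at 1e-6.
        basicValueIteration_mvReduce_uint64_double_minimize(10000, 1e-6, false,
            rowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
        // x now holds the fixpoint approximation computed on the GPU.
    }

The _minimize/_maximize pair exists because Minimize is a compile-time template parameter of the kernels, so the DLL exports one monomorphized wrapper per optimization direction.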
+#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cudaForStorm.h b/resources/cudaForStorm/srcCuda/cudaForStorm.h index 05e08b987..fdc484eaa 100644 --- a/resources/cudaForStorm/srcCuda/cudaForStorm.h +++ b/resources/cudaForStorm/srcCuda/cudaForStorm.h @@ -8,6 +8,9 @@ // TopologicalValueIteration #include "srcCuda/basicValueIteration.h" +// Utility Functions +#include "srcCuda/utility.h" + diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h index 238b3aa36..4b13005f3 100644 --- a/resources/cudaForStorm/srcCuda/cuspExtension.h +++ b/resources/cudaForStorm/srcCuda/cuspExtension.h @@ -25,6 +25,8 @@ #pragma once #include +#include +#include namespace cusp { @@ -33,6 +35,261 @@ namespace detail namespace device { +////////////////////////////////////////////////////////////////////////////// +// CSR SpMV kernels based on a vector model (one warp per row) +////////////////////////////////////////////////////////////////////////////// +// +// spmv_csr_vector_device +// Each row of the CSR matrix is assigned to a warp. The warp computes +// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with +// the x vector, in parallel. This division of work implies that +// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous +// manner (but generally not aligned). On GT200 these accesses are +// coalesced, unlike kernels based on the one-row-per-thread division of +// work. Since an entire 32-thread warp is assigned to each row, many +// threads will remain idle when their row contains a small number +// of elements. This code relies on implicit synchronization among +// threads in a warp. +// +// spmv_csr_vector_tex_device +// Same as spmv_csr_vector_tex_device, except that the texture cache is +// used for accessing the x vector. 
+// +// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] + + +template +__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) +__global__ void +storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y) +{ + __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals + __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2]; + + const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; + + const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector + const IndexType vector_id = thread_id / THREADS_PER_VECTOR; // global vector index + const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block + const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(IndexType row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; + + const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local sum + ValueType sum = 0; + + if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) + sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + + // accumulate local sums + for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) + sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + } + else + { + // accumulate local sums + for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) + sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + } + + // store local sum in shared memory + sdata[threadIdx.x] = sum; + + // reduce local sums to row sum + if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; + if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; + if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; + if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; + if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; + + // first thread writes the result + if (thread_lane == 0) + y[row] = sdata[threadIdx.x]; + } +} + +template +__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) +__global__ void +storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y, const ValueType minMaxInitializer) +{ + __shared__ volatile ValueType sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals + __shared__ volatile IndexType ptrs[ROWS_PER_BLOCK][2]; + + const IndexType 
THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; + + const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const IndexType thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector + const IndexType vector_id = thread_id / THREADS_PER_ROW; // global vector index + const IndexType vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block + const IndexType num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(IndexType row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; + + const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local Min/Max + ValueType localMinMaxElement = minMaxInitializer; + + if (THREADS_PER_ROW == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + IndexType jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + if(Minimize) { + localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // accumulate local sums + for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + else + { + // accumulate local sums + for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = localMinMaxElement; + + // reduce local min/max to row min/max + if (Minimize) { + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement); + } else { + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? 
sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement); + } + + // first thread writes the result + if (thread_lane == 0) + x[row] = sdata[threadIdx.x]; + } +} + +template +void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) +{ + ValueType __minMaxInitializer = 0; + if (Minimize) { + __minMaxInitializer = std::numeric_limits::max(); + } + const ValueType minMaxInitializer = __minMaxInitializer; + + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + storm_cuda_opt_vector_reduce_kernel <<>> + (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); +} + +template +void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_entries, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) +{ + const IndexType rows_per_group = num_entries / num_rows; + + if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } + + __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); +} + +template +void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + storm_cuda_opt_spmv_csr_vector_kernel <<>> + (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + + if (UseCache) + unbind_x(x); +} + +template +void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +{ + const IndexType nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if 
(nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +template +void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +{ + const IndexType nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +// NON-OPT + template void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) { diff --git a/resources/cudaForStorm/srcCuda/utility.cu b/resources/cudaForStorm/srcCuda/utility.cu index 9366453f9..99165ba07 100644 --- a/resources/cudaForStorm/srcCuda/utility.cu +++ b/resources/cudaForStorm/srcCuda/utility.cu @@ -1,3 +1,7 @@ +#include "utility.h" + +#include + size_t getFreeCudaMemory() { size_t freeMemory; size_t totalMemory; @@ -14,6 +18,16 @@ size_t getTotalCudaMemory() { return totalMemory; } -void resetCudaDevice() { - cudaDeviceReset(); +bool resetCudaDevice() { + cudaError_t result = cudaDeviceReset(); + return (result == cudaSuccess); +} + +int getRuntimeCudaVersion() { + int result = -1; + cudaError_t errorResult = cudaRuntimeGetVersion(&result); + if (errorResult != cudaSuccess) { + return -1; + } + return result; } \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/utility.h b/resources/cudaForStorm/srcCuda/utility.h index ed25af9b6..f3110fbeb 100644 --- a/resources/cudaForStorm/srcCuda/utility.h +++ b/resources/cudaForStorm/srcCuda/utility.h @@ -1,3 +1,12 @@ -size_t getFreeCudaMemory(); -size_t getTotalCudaMemory(); -void resetCudaDevice(); \ No newline at end of file +#ifndef STORM_CUDAFORSTORM_UTILITY_H_ +#define STORM_CUDAFORSTORM_UTILITY_H_ + +// Library exports +#include "cudaForStorm_Export.h" + +cudaForStorm_EXPORT size_t getFreeCudaMemory(); +cudaForStorm_EXPORT size_t getTotalCudaMemory(); +cudaForStorm_EXPORT bool resetCudaDevice(); +cudaForStorm_EXPORT int getRuntimeCudaVersion(); + +#endif // STORM_CUDAFORSTORM_UTILITY_H_ \ No newline at end of file diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index f6c4473fa..14f94f807 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -13,6 +13,9 @@ #include "log4cplus/loggingmacros.h" extern log4cplus::Logger logger; +#include "storm-config.h" 
+#include "cudaForStorm.h" + namespace storm { namespace solver { @@ -80,6 +83,7 @@ namespace storm { for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) { storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; + std::cout << "SCC Index: " << *sccIndexIt << std::endl; // Generate a submatrix storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); @@ -121,6 +125,7 @@ namespace storm { } // For the current SCC, we need to perform value iteration until convergence. +#ifndef STORM_HAVE_CUDAFORSTORM localIterations = 0; converged = false; while (!converged && localIterations < this->maximalNumberOfIterations) { @@ -157,6 +162,23 @@ namespace storm { ++localIterations; ++globalIterations; } + std::cout << "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations." << std::endl; +#else + if (!resetCudaDevice()) { + std::cout << "Could not reset CUDA Device!" << std::endl; + } + std::cout << "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)." << std::endl; + size_t memSize = sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size(); + std::cout << "We will allocate " << memSize << " Bytes." << std::endl; + std::cout << "The CUDA Runtime Version is " << getRuntimeCudaVersion() << std::endl; + + if (minimize) { + basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices); + } + else { + basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices); + } +#endif // The Result of this SCC has to be taken back into the main result vector innerIndex = 0; @@ -165,7 +187,7 @@ namespace storm { ++innerIndex; } - // Since the pointers for swapping in the calculation point to temps they should not be valide anymore + // Since the pointers for swapping in the calculation point to temps they should not be valid anymore currentX = nullptr; swap = nullptr; diff --git a/src/storage/SparseMatrix.h b/src/storage/SparseMatrix.h index dbb80745b..cac8ae586 100644 --- a/src/storage/SparseMatrix.h +++ b/src/storage/SparseMatrix.h @@ -19,6 +19,11 @@ namespace storm { class EigenAdapter; class StormAdapter; } + + namespace solver { + template + class TopologicalValueIterationNondeterministicLinearEquationSolver; + } } namespace storm { @@ -142,6 +147,7 @@ namespace storm { friend class storm::adapters::GmmxxAdapter; friend class storm::adapters::EigenAdapter; friend class storm::adapters::StormAdapter; + friend class storm::solver::TopologicalValueIterationNondeterministicLinearEquationSolver; typedef typename std::vector>::iterator iterator; typedef typename std::vector>::const_iterator const_iterator; From e45fa5a82ccd6c8f6608ca7b6caf063c08b2f6b5 Mon Sep 17 00:00:00 2001 From: PBerger Date: Wed, 12 Mar 2014 23:31:24 +0100 Subject: 
[PATCH 23/43] Added a Test for the CUDA Plugin. Added accessors for the SparseMatrix as I need access to the internal vectors. Added a pure SPMV Kernel interface to check the kernel for errors. Former-commit-id: 46e1449eeb4993de24a753d07a08b240a1465021 --- .../srcCuda/basicValueIteration.cu | 123 ++++++++++++++++++ .../srcCuda/basicValueIteration.h | 1 + .../cudaForStorm/srcCuda/cuspExtension.h | 2 +- src/storage/SparseMatrix.cpp | 16 +++ src/storage/SparseMatrix.h | 9 ++ test/functional/solver/CudaPluginTest.cpp | 65 +++++++++ 6 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 test/functional/solver/CudaPluginTest.cpp diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index 31fb8d4ba..712dfac09 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -267,6 +267,125 @@ cleanup: } } +template +void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { + IndexType* device_matrixRowIndices = nullptr; + IndexType* device_matrixColIndicesAndValues = nullptr; + ValueType* device_x = nullptr; + ValueType* device_multiplyResult = nullptr; + + std::cout.sync_with_stdio(true); + std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory()))*100 << "%)." << std::endl; + size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size(); + std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; + + const IndexType matrixRowCount = matrixRowIndices.size() - 1; + const IndexType matrixNnzCount = columnIndicesAndValues.size(); + + cudaError_t cudaMallocResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1)); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_x), sizeof(ValueType) * matrixColCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." 
<< std::endl; + goto cleanup; + } + + // Memory allocated, copy data to device + cudaError_t cudaCopyResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + // Preset the multiplyResult to zeros... + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + CUDA_CHECK_ALL_ERRORS(); + + // Get result back from the device + cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + // All code related to freeing memory and clearing up the device +cleanup: + if (device_matrixRowIndices != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl; + } + device_matrixRowIndices = nullptr; + } + if (device_matrixColIndicesAndValues != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; + } + device_matrixColIndicesAndValues = nullptr; + } + if (device_x != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_x); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; + } + device_x = nullptr; + } + if (device_multiplyResult != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_multiplyResult); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_multiplyResult = nullptr; + } +} + /* * Declare and implement all exported functions for these Kernels here * @@ -276,6 +395,10 @@ void cudaForStormTestFunction(int a, int b) { std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl; } +void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { + basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); +} + void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { if (relativePrecisionCheck) { basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index 61529d963..2395c0311 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -11,5 +11,6 @@ cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b); cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); +cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); #endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h index 4b13005f3..34e6e6e14 100644 --- a/resources/cudaForStorm/srcCuda/cuspExtension.h +++ b/resources/cudaForStorm/srcCuda/cuspExtension.h @@ -2,7 +2,7 @@ * This is an extension of the original CUSP csr_vector.h SPMV implementation. * It is based on the Code and incorporates changes as to cope with the details * of the StoRM code. - * As this is mostly copy & past, the original license still applies. + * As this is mostly copy & paste, the original license still applies. */ /* diff --git a/src/storage/SparseMatrix.cpp b/src/storage/SparseMatrix.cpp index 1ba121550..b831ae9df 100644 --- a/src/storage/SparseMatrix.cpp +++ b/src/storage/SparseMatrix.cpp @@ -797,6 +797,22 @@ namespace storm { } return true; } + + /*! + * Returns a reference to the internal rowMapping vector + */ + template + std::vector const& SparseMatrix::__internal_getRowIndications() { + return this->rowIndications; + } + + /*! 
+ * Returns a reference to the internal columnMapping vector + */ + template + std::vector> const& SparseMatrix::__internal_getColumnsAndValues() { + return this->columnsAndValues; + } template std::ostream& operator<<(std::ostream& out, SparseMatrix const& matrix) { diff --git a/src/storage/SparseMatrix.h b/src/storage/SparseMatrix.h index cac8ae586..973fa79dd 100644 --- a/src/storage/SparseMatrix.h +++ b/src/storage/SparseMatrix.h @@ -583,6 +583,15 @@ namespace storm { * @return size_t A hash value for this matrix. */ std::size_t hash() const; + + /*! + * Returns a reference to the internal rowMapping vector + */ + std::vector const& __internal_getRowIndications(); + /*! + * Returns a reference to the internal columnMapping vector + */ + std::vector> const& __internal_getColumnsAndValues(); private: // The number of rows of the matrix. diff --git a/test/functional/solver/CudaPluginTest.cpp b/test/functional/solver/CudaPluginTest.cpp new file mode 100644 index 000000000..a59697af8 --- /dev/null +++ b/test/functional/solver/CudaPluginTest.cpp @@ -0,0 +1,65 @@ +#include "gtest/gtest.h" +#include "src/storage/SparseMatrix.h" +#include "src/exceptions/InvalidStateException.h" +#include "src/exceptions/OutOfRangeException.h" + +#include "storm-config.h" + +#ifdef STORM_HAVE_CUDAFORSTORM + +#include "cudaForStorm.h" + +TEST(CudaPlugin, CreationWithDimensions) { + storm::storage::SparseMatrixBuilder matrixBuilder(4, 4, 10); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 8.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 2, -5.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 3, 2.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 0, 2.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 2.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, 4.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 4.0)); + + + storm::storage::SparseMatrix matrix; + ASSERT_NO_THROW(matrix = matrixBuilder.build()); + + ASSERT_EQ(4, matrix.getRowCount()); + ASSERT_EQ(4, matrix.getColumnCount()); + ASSERT_EQ(10, matrix.getEntryCount()); + + std::vector x({0, 4, 1, 1}); + std::vector b({0, 0, 0, 0}); + + ASSERT_NO_THROW(basicValueIteration_spmv_uint64_double(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); + + ASSERT_EQ(b.at(0), 3); + ASSERT_EQ(b.at(1), 25); + ASSERT_EQ(b.at(2), 16); + ASSERT_EQ(b.at(3), 0); +} + +TEST(CudaPlugin, VerySmall) { + storm::storage::SparseMatrixBuilder matrixBuilder(2, 2, 2); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0)); + + storm::storage::SparseMatrix matrix; + ASSERT_NO_THROW(matrix = matrixBuilder.build()); + + ASSERT_EQ(2, matrix.getRowCount()); + ASSERT_EQ(2, matrix.getColumnCount()); + ASSERT_EQ(2, matrix.getEntryCount()); + + std::vector x({ 4.0, 8.0 }); + std::vector b({ 0.0, 0.0 }); + + ASSERT_NO_THROW(basicValueIteration_spmv_uint64_double(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); + + ASSERT_EQ(b.at(0), 4.0); + ASSERT_EQ(b.at(1), 16.0); +} + +#endif \ No newline at end of file From 208005e68bc535a178a668699ac7cfadf6be8b61 Mon Sep 17 00:00:00 2001 From: PBerger Date: Thu, 13 Mar 2014 20:23:47 +0100 Subject: [PATCH 24/43] Added Tests to the Cuda Plugin. 
Refactored kernel for SpMV to use two vectors for column indexes and values. Former-commit-id: 3560d3cc9a8b0addc36f20704c771f41c7476f3e --- .../srcCuda/basicValueIteration.cu | 372 ++++++++++++++++-- .../srcCuda/basicValueIteration.h | 6 +- .../cudaForStorm/srcCuda/cuspExtension.h | 38 +- test/functional/solver/CudaPluginTest.cpp | 122 +++++- 4 files changed, 485 insertions(+), 53 deletions(-) diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index 712dfac09..e42f36eda 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -25,9 +25,6 @@ std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \ } } while(false) -__global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) { - *B = *A; -} template struct equalModuloPrecision : public thrust::binary_function @@ -36,18 +33,34 @@ __host__ __device__ T operator()(const T &x, const T &y) const { if (Relative) { const T result = (x - y) / y; - return (result > 0) ? result : -result; + return ((result >= 0) ? (result) : (-result)); } else { const T result = (x - y); - return (result > 0) ? result : -result; + return ((result >= 0) ? (result) : (-result)); } } }; +template +void exploadVector(std::vector> const& inputVector, std::vector& indexVector, std::vector& valueVector) { + indexVector.reserve(inputVector.size()); + valueVector.reserve(inputVector.size()); + for (size_t i = 0; i < inputVector.size(); ++i) { + indexVector.push_back(inputVector.at(i).first); + valueVector.push_back(inputVector.at(i).second); + } +} + template void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { + + std::vector matrixColumnIndices; + std::vector matrixValues; + exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); + IndexType* device_matrixRowIndices = nullptr; - IndexType* device_matrixColIndicesAndValues = nullptr; + IndexType* device_matrixColIndices = nullptr; + ValueType* device_matrixValues = nullptr; ValueType* device_x = nullptr; ValueType* device_xSwap = nullptr; ValueType* device_b = nullptr; @@ -76,9 +89,16 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy } CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; + std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixValues), sizeof(ValueType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." 
<< std::endl; goto cleanup; } @@ -128,9 +148,16 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy } CUDA_CHECK_ALL_ERRORS(); - cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); + cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { - std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl; goto cleanup; } @@ -174,7 +201,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy // Data is on device, start Kernel while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization - cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); thrust::device_ptr devicePtrThrust_b(device_b); @@ -197,11 +224,14 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy CUDA_CHECK_ALL_ERRORS(); // Reduce: get Max over x and check for res < Precision - ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, 0, thrust::maximum()); + ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits::max(), thrust::maximum()); CUDA_CHECK_ALL_ERRORS(); - converged = maxX < precision; + converged = (maxX < precision); ++iterationCount; + // If there are empty rows in the matrix we need to clear multiplyResult + thrust::fill(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, 0); + // Swap pointers, device_x always contains the most current result std::swap(device_x, device_xSwap); } @@ -223,12 +253,19 @@ cleanup: } device_matrixRowIndices = nullptr; } - if (device_matrixColIndicesAndValues != nullptr) { - cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); + if (device_matrixColIndices != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl; + } + device_matrixColIndices = nullptr; + } + if (device_matrixValues != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixValues); if (cudaFreeResult != cudaSuccess) { - std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; + std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." 
<< std::endl; } - device_matrixColIndicesAndValues = nullptr; + device_matrixValues = nullptr; } if (device_x != nullptr) { cudaError_t cudaFreeResult = cudaFree(device_x); @@ -269,8 +306,13 @@ cleanup: template void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { + std::vector matrixColumnIndices; + std::vector matrixValues; + exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); + IndexType* device_matrixRowIndices = nullptr; - IndexType* device_matrixColIndicesAndValues = nullptr; + IndexType* device_matrixColIndices = nullptr; + ValueType* device_matrixValues = nullptr; ValueType* device_x = nullptr; ValueType* device_multiplyResult = nullptr; @@ -292,9 +334,16 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixValues), sizeof(ValueType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; + std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } @@ -323,9 +372,16 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); // Get result back from the device @@ -363,12 +419,19 @@ cleanup: } device_matrixRowIndices = nullptr; } - if (device_matrixColIndicesAndValues != nullptr) { - cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); + if (device_matrixColIndices != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices); if (cudaFreeResult != cudaSuccess) { - std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; + std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl; } - device_matrixColIndicesAndValues = nullptr; + device_matrixColIndices = nullptr; + } + if (device_matrixValues != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixValues); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_matrixValues = nullptr; } if (device_x != nullptr) { cudaError_t cudaFreeResult = cudaFree(device_x); @@ -386,19 +449,266 @@ cleanup: } } +template +void basicValueIteration_addVectorsInplace(std::vector& a, std::vector const& b) { + ValueType* device_a = nullptr; + ValueType* device_b = nullptr; + + const size_t vectorSize = std::max(a.size(), b.size()); + + cudaError_t cudaMallocResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_a), sizeof(ValueType) * vectorSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_b), sizeof(ValueType) * vectorSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + // Memory allocated, copy data to device + cudaError_t cudaCopyResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + do { + // Transform: add vector b to vector a in place (a[i] = a[i] + b[i]) + thrust::device_ptr devicePtrThrust_a(device_a); + thrust::device_ptr devicePtrThrust_b(device_b); + thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus()); + CUDA_CHECK_ALL_ERRORS(); + } while (false); + + // Get result back from the device + cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + // All code related to freeing memory and clearing up the device +cleanup: + if (device_a != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_a); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl; + } + device_a = nullptr; + } + if (device_b != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_b); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_b = nullptr; + } +} + +template +void basicValueIteration_reduceGroupedVector(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { + ValueType* device_groupedVector = nullptr; + IndexType* device_grouping = nullptr; + ValueType* device_target = nullptr; + + const size_t groupedSize = groupedVector.size(); + const size_t groupingSize = grouping.size(); + const size_t targetSize = targetVector.size(); + + cudaError_t cudaMallocResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_groupedVector), sizeof(ValueType) * groupedSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_grouping), sizeof(IndexType) * groupingSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_target), sizeof(ValueType) * targetSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + // Memory allocated, copy data to device + cudaError_t cudaCopyResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + do { + // Reduce: Reduce multiplyResult to a new x vector + cusp::detail::device::storm_cuda_opt_vector_reduce(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); + CUDA_CHECK_ALL_ERRORS(); + } while (false); + + // Get result back from the device + cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + // All code related to freeing memory and clearing up the device +cleanup: + if (device_groupedVector != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_groupedVector); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl; + } + device_groupedVector = nullptr; + } + if (device_grouping != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_grouping); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_grouping = nullptr; + } + if (device_target != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_target); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl; + } + device_target = nullptr; + } +} + +template +void basicValueIteration_equalModuloPrecision(std::vector const& x, std::vector const& y, ValueType& maxElement) { + ValueType* device_x = nullptr; + ValueType* device_y = nullptr; + + const size_t vectorSize = x.size(); + + cudaError_t cudaMallocResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_x), sizeof(ValueType) * vectorSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_y), sizeof(ValueType) * vectorSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + // Memory allocated, copy data to device + cudaError_t cudaCopyResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + do { + // Transform: x = abs(x - xSwap)/ xSwap + thrust::device_ptr devicePtrThrust_x(device_x); + thrust::device_ptr devicePtrThrust_y(device_y); + thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision()); + CUDA_CHECK_ALL_ERRORS(); + + // Reduce: get Max over x and check for res < Precision + maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits::max(), thrust::maximum()); + CUDA_CHECK_ALL_ERRORS(); + } while (false); + + // All code related to freeing memory and clearing up the device +cleanup: + if (device_x != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_x); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; + } + device_x = nullptr; + } + if (device_y != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_y); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_y = nullptr; + } +} + /* * Declare and implement all exported functions for these Kernels here * */ -void cudaForStormTestFunction(int a, int b) { - std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl; -} - void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); } +void basicValueIteration_addVectorsInplace_double(std::vector& a, std::vector const& b) { + basicValueIteration_addVectorsInplace(a, b); +} + +void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { + basicValueIteration_reduceGroupedVector(groupedVector, grouping, targetVector); +} + +void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { + basicValueIteration_reduceGroupedVector(groupedVector, grouping, targetVector); +} + +void basicValueIteration_equalModuloPrecision_double_Relative(std::vector const& x, std::vector const& y, double& maxElement) { + basicValueIteration_equalModuloPrecision(x, y, maxElement); +} + +void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector const& x, std::vector const& y, double& maxElement) { + basicValueIteration_equalModuloPrecision(x, y, maxElement); +} + void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { if (relativePrecisionCheck) { basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index 2395c0311..ad4ce0fc9 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -8,9 +8,13 @@ // Library exports #include "cudaForStorm_Export.h" -cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b); cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); +cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector& a, std::vector const& b); +cudaForStorm_EXPORT void 
basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); +cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); +cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector const& x, std::vector const& y, double& maxElement); +cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector const& x, std::vector const& y, double& maxElement); #endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h index 34e6e6e14..f07849816 100644 --- a/resources/cudaForStorm/srcCuda/cuspExtension.h +++ b/resources/cudaForStorm/srcCuda/cuspExtension.h @@ -61,7 +61,7 @@ namespace device template __launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) __global__ void -storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y) +storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType * x, ValueType * y) { __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2]; @@ -95,17 +95,17 @@ storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType // accumulate local sums if(jj >= row_start && jj < row_end) - sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x); // accumulate local sums for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) - sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x); } else { // accumulate local sums for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) - sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x); } // store local sum in shared memory @@ -214,7 +214,7 @@ storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType * template void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) { - ValueType __minMaxInitializer = 0; + ValueType __minMaxInitializer = -std::numeric_limits::max(); if (Minimize) { __minMaxInitializer = std::numeric_limits::max(); } @@ -244,7 +244,7 @@ void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_ } template -void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) { const size_t 
THREADS_PER_BLOCK = 128; const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; @@ -256,36 +256,36 @@ void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType bind_x(x); storm_cuda_opt_spmv_csr_vector_kernel <<>> - (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); if (UseCache) unbind_x(x); } template -void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) { const IndexType nnz_per_row = num_entries / num_rows; - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); } template -void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) { const IndexType nnz_per_row = num_entries / num_rows; - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); 
return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); } // NON-OPT diff --git a/test/functional/solver/CudaPluginTest.cpp b/test/functional/solver/CudaPluginTest.cpp index a59697af8..3d2a69c84 100644 --- a/test/functional/solver/CudaPluginTest.cpp +++ b/test/functional/solver/CudaPluginTest.cpp @@ -9,7 +9,7 @@ #include "cudaForStorm.h" -TEST(CudaPlugin, CreationWithDimensions) { +TEST(CudaPlugin, SpMV_4x4) { storm::storage::SparseMatrixBuilder matrixBuilder(4, 4, 10); ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0)); ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0)); @@ -41,7 +41,7 @@ TEST(CudaPlugin, CreationWithDimensions) { ASSERT_EQ(b.at(3), 0); } -TEST(CudaPlugin, VerySmall) { +TEST(CudaPlugin, SpMV_VerySmall) { storm::storage::SparseMatrixBuilder matrixBuilder(2, 2, 2); ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0)); @@ -62,4 +62,122 @@ TEST(CudaPlugin, VerySmall) { ASSERT_EQ(b.at(1), 16.0); } +TEST(CudaPlugin, AddVectorsInplace) { + std::vector vectorA_1 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; + std::vector vectorA_2 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; + std::vector vectorA_3 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; + std::vector vectorB = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + std::vector vectorC = { -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0 }; + + ASSERT_EQ(vectorA_1.size(), 8); + ASSERT_EQ(vectorA_2.size(), 8); + ASSERT_EQ(vectorA_3.size(), 8); + ASSERT_EQ(vectorB.size(), 8); + ASSERT_EQ(vectorC.size(), 8); + + ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_1, vectorB)); + ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_2, vectorC)); + + ASSERT_EQ(vectorA_1.size(), 8); + ASSERT_EQ(vectorA_2.size(), 8); + ASSERT_EQ(vectorA_3.size(), 8); + ASSERT_EQ(vectorB.size(), 8); + ASSERT_EQ(vectorC.size(), 8); + + for (size_t i = 0; i < vectorA_3.size(); ++i) { + double cpu_result_b = vectorA_3.at(i) + vectorB.at(i); + double cpu_result_c = vectorA_3.at(i) + vectorC.at(i); + + ASSERT_EQ(cpu_result_b, vectorA_1.at(i)); + ASSERT_EQ(cpu_result_c, vectorA_2.at(i)); + } +} + +TEST(CudaPlugin, ReduceGroupedVector) { + std::vector groupedVector = { + 0.0, -1000.0, 0.000004, // Group 0 + 5.0, // Group 1 + 0.0, 1.0, 2.0, 3.0, // Group 2 + -1000.0, -3.14, -0.0002,// Group 3 (neg only) + 25.25, 25.25, 25.25, // Group 4 + 0.0, 0.0, 1.0, // Group 5 + -0.000001, 0.000001 // Group 6 + }; + std::vector grouping = { + 0, 3, 4, 8, 11, 14, 17, 19 + }; + + std::vector result_minimize = { + -1000.0, // Group 0 + 5.0, + 0.0, + -1000.0, + 25.25, + 0.0, + -0.000001 + }; + std::vector result_maximize = { + 0.000004, + 5.0, + 3.0, + -0.0002, + 25.25, + 1.0, + 0.000001 + }; + + std::vector result_cuda_minimize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + std::vector result_cuda_maximize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + + 
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_minimize(groupedVector, grouping, result_cuda_minimize)); + ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_maximize(groupedVector, grouping, result_cuda_maximize)); + + for (size_t i = 0; i < result_minimize.size(); ++i) { + ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i)); + ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i)); + } +} + +TEST(CudaPlugin, equalModuloPrecision) { + std::vector x = { + 123.45L, 67.8L, 901.23L, 456789.012L, 3.456789L, -4567890.12L + }; + std::vector y1 = { + 0.45L, 0.8L, 0.23L, 0.012L, 0.456789L, -0.12L + }; + std::vector y2 = { + 0.45L, 0.8L, 0.23L, 456789.012L, 0.456789L, -4567890.12L + }; + std::vector x2; + std::vector x3; + std::vector y3; + std::vector y4; + x2.reserve(1000); + x3.reserve(1000); + y3.reserve(1000); + y4.reserve(1000); + for (size_t i = 0; i < 1000; ++i) { + x2.push_back(static_cast(i)); + y3.push_back(1.0); + x3.push_back(-(1000.0 - static_cast(i))); + y4.push_back(1.0); + } + + double maxElement1 = 0.0L; + double maxElement2 = 0.0L; + double maxElement3 = 0.0L; + double maxElement4 = 0.0L; + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y1, maxElement1)); + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y2, maxElement2)); + + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x2, y3, maxElement3)); + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x3, y4, maxElement4)); + + ASSERT_DOUBLE_EQ(4567890.0L, maxElement1); + ASSERT_DOUBLE_EQ(901.0L, maxElement2); + + ASSERT_DOUBLE_EQ(998.0L, maxElement3); + ASSERT_DOUBLE_EQ(1001.0L, maxElement4); +} + #endif \ No newline at end of file From b63a6179d8489fecd9956d8ff434fc27f4217195 Mon Sep 17 00:00:00 2001 From: PBerger Date: Thu, 13 Mar 2014 23:47:42 +0100 Subject: [PATCH 25/43] Fixed a possible bug in the equalModuloPrecision comparison of vectors. Same for the CUDA Kernel, but there all hell broke loose. Former-commit-id: 6cb21c391962fda161a51bff543877ef701317f8 --- .../cudaForStorm/srcCuda/basicValueIteration.cu | 12 +++++++----- src/utility/vector.h | 5 ++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index e42f36eda..25541adbc 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -14,7 +14,7 @@ #include #include - +#ifdef DEBUG #define CUDA_CHECK_ALL_ERRORS() do { \ cudaError_t errSync = cudaGetLastError(); \ cudaError_t errAsync = cudaDeviceSynchronize(); \ @@ -24,7 +24,9 @@ if (errAsync != cudaSuccess) { \ std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \ } } while(false) - +#else +#define CUDA_CHECK_ALL_ERRORS() do {} while (false) +#endif template struct equalModuloPrecision : public thrust::binary_function @@ -32,6 +34,9 @@ struct equalModuloPrecision : public thrust::binary_function __host__ __device__ T operator()(const T &x, const T &y) const { if (Relative) { + if (y == 0) { + return x; + } const T result = (x - y) / y; return ((result >= 0) ? 
(result) : (-result)); } else { @@ -229,9 +234,6 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy converged = (maxX < precision); ++iterationCount; - // If there are empty rows in the matrix we need to clear multiplyResult - thrust::fill(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, 0); - // Swap pointers, device_x always contains the most current result std::swap(device_x, device_xSwap); } diff --git a/src/utility/vector.h b/src/utility/vector.h index dd3bb2a1f..6527d7b19 100644 --- a/src/utility/vector.h +++ b/src/utility/vector.h @@ -330,7 +330,10 @@ namespace storm { template bool equalModuloPrecision(T const& val1, T const& val2, T precision, bool relativeError = true) { if (relativeError) { - if (std::abs(val1 - val2)/val2 > precision) return false; + if (val2 == 0) { + return (val1 > precision); + } + if (std::abs((val1 - val2)/val2) > precision) return false; } else { if (std::abs(val1 - val2) > precision) return false; } From d3f513b0a0a62484e438bbd157d1241fb734deba Mon Sep 17 00:00:00 2001 From: PBerger Date: Sat, 15 Mar 2014 18:28:56 +0100 Subject: [PATCH 26/43] Added debug output to CUDA Kernel. Added a performance test for the CUDA stuff. Former-commit-id: 9953befdea6d5235fe6dc8cec95028fcc062d46c --- .../srcCuda/basicValueIteration.cu | 11 +- ...onNondeterministicLinearEquationSolver.cpp | 80 ++++++-- ...ValueIterationMdpPrctlModelCheckerTest.cpp | 171 ++++++++++++++++++ 3 files changed, 243 insertions(+), 19 deletions(-) create mode 100644 test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index 25541adbc..d3c412ddf 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -58,11 +58,10 @@ void exploadVector(std::vector> const& inputVect template void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { - std::vector matrixColumnIndices; std::vector matrixValues; exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); - + IndexType* device_matrixRowIndices = nullptr; IndexType* device_matrixColIndices = nullptr; ValueType* device_matrixValues = nullptr; @@ -72,10 +71,12 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy ValueType* device_multiplyResult = nullptr; IndexType* device_nondeterministicChoiceIndices = nullptr; +#ifdef DEBUG std::cout.sync_with_stdio(true); std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory()))*100 << "%)." << std::endl; size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size(); std::cout << "(DLL) We will allocate " << memSize << " Bytes." 
<< std::endl; +#endif const IndexType matrixRowCount = matrixRowIndices.size() - 1; const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1; @@ -237,7 +238,9 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy // Swap pointers, device_x always contains the most current result std::swap(device_x, device_xSwap); } +#ifdef DEBUG std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl; +#endif // Get x back from the device cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost); @@ -311,17 +314,19 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector matrixColumnIndices; std::vector matrixValues; exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); - + IndexType* device_matrixRowIndices = nullptr; IndexType* device_matrixColIndices = nullptr; ValueType* device_matrixValues = nullptr; ValueType* device_x = nullptr; ValueType* device_multiplyResult = nullptr; +#ifdef DEBUG std::cout.sync_with_stdio(true); std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory()))*100 << "%)." << std::endl; size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size(); std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; +#endif const IndexType matrixRowCount = matrixRowIndices.size() - 1; const IndexType matrixNnzCount = columnIndicesAndValues.size(); diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 14f94f807..f50e8caf0 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -8,6 +8,7 @@ #include "src/models/PseudoModel.h" #include "src/storage/StronglyConnectedComponentDecomposition.h" #include "src/exceptions/IllegalArgumentException.h" +#include "src/exceptions/InvalidStateException.h" #include "log4cplus/logger.h" #include "log4cplus/loggingmacros.h" @@ -83,7 +84,6 @@ namespace storm { for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) { storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; - std::cout << "SCC Index: " << *sccIndexIt << std::endl; // Generate a submatrix storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); @@ -125,7 +125,24 @@ namespace storm { } // For the current SCC, we need to perform value iteration until convergence. 
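For orientation, the loop being restructured in the following hunks is one value-iteration sweep per SCC. Below is a minimal CPU sketch of that sweep, assuming a CSR matrix (row offsets, column indices, values) and row-group offsets for the nondeterministic choices; the function and parameter names are illustrative, not the solver's actual API.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

// One sweep of x' = reduce_{min/max per row group}(A * x + b), returning
// whether x and x' agree up to the given (relative or absolute) precision.
static bool valueIterationStep(bool minimize,
        std::vector<uint_fast64_t> const& rowIndices,      // CSR row offsets, size rows + 1
        std::vector<uint_fast64_t> const& columns,         // CSR column indices
        std::vector<double> const& values,                 // CSR values
        std::vector<uint_fast64_t> const& rowGroupIndices, // choice offsets, size groups + 1
        std::vector<double> const& b,
        std::vector<double>& x,
        double precision, bool relative) {
    std::vector<double> newX(x.size());
    for (uint_fast64_t group = 0; group + 1 < rowGroupIndices.size(); ++group) {
        double best = minimize ? std::numeric_limits<double>::max()
                               : -std::numeric_limits<double>::max();
        // Each row in the group is one nondeterministic choice.
        for (uint_fast64_t row = rowGroupIndices[group]; row < rowGroupIndices[group + 1]; ++row) {
            double sum = b[row];
            for (uint_fast64_t k = rowIndices[row]; k < rowIndices[row + 1]; ++k) {
                sum += values[k] * x[columns[k]];
            }
            best = minimize ? std::min(best, sum) : std::max(best, sum);
        }
        newX[group] = best;
    }
    // Convergence check modulo precision, relative or absolute.
    bool converged = true;
    for (std::size_t i = 0; i < x.size(); ++i) {
        double diff = std::abs(newX[i] - x[i]);
        if (relative && x[i] != 0.0) {
            diff /= std::abs(x[i]);
        }
        if (diff > precision) {
            converged = false;
            break;
        }
    }
    x.swap(newX);
    return converged;
}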
-#ifndef STORM_HAVE_CUDAFORSTORM +#ifdef STORM_HAVE_CUDAFORSTORM + if (!resetCudaDevice()) { + LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); + throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; + } + + LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); + LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); + LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); + + std::vector copyX(*currentX); + if (minimize) { + basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices); + } + else { + basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices); + } + localIterations = 0; converged = false; while (!converged && localIterations < this->maximalNumberOfIterations) { @@ -139,7 +156,7 @@ namespace storm { /* Versus: A.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, b); + storm::utility::vector::addVectorsInPlace(*multiplyResult, b); */ // Reduce the vector x' by applying min/max for all non-deterministic choices. @@ -162,22 +179,53 @@ namespace storm { ++localIterations; ++globalIterations; } - std::cout << "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations." << std::endl; -#else - if (!resetCudaDevice()) { - std::cout << "Could not reset CUDA Device!" << std::endl; - } - std::cout << "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)." << std::endl; - size_t memSize = sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size(); - std::cout << "We will allocate " << memSize << " Bytes." << std::endl; - std::cout << "The CUDA Runtime Version is " << getRuntimeCudaVersion() << std::endl; + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. 
" << maximalNumberOfIterations << " Iterations."); - if (minimize) { - basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices); + uint_fast64_t diffCount = 0; + for (size_t i = 0; i < currentX->size(); ++i) { + if (currentX->at(i) != copyX.at(i)) { + LOG4CPLUS_WARN(logger, "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i)); + std::cout << "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i) << std::endl; + } } - else { - basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices); +#else + localIterations = 0; + converged = false; + while (!converged && localIterations < this->maximalNumberOfIterations) { + // Compute x' = A*x + b. + sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); + storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); + + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); + //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + + /* + Versus: + A.multiplyWithVector(*currentX, *multiplyResult); + storm::utility::vector::addVectorsInPlace(*multiplyResult, b); + */ + + // Reduce the vector x' by applying min/max for all non-deterministic choices. + if (minimize) { + storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } + else { + storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } + + // Determine whether the method converged. + // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher + // running time. In fact, it is faster. This has to be investigated. + // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); + + // Update environment variables. + std::swap(currentX, swap); + + ++localIterations; + ++globalIterations; } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. 
" << maximalNumberOfIterations << " Iterations."); #endif // The Result of this SCC has to be taken back into the main result vector diff --git a/test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp new file mode 100644 index 000000000..2f0e35393 --- /dev/null +++ b/test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -0,0 +1,171 @@ +#include "gtest/gtest.h" +#include "storm-config.h" + +#include "src/settings/Settings.h" +#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h" +#include "src/solver/NativeNondeterministicLinearEquationSolver.h" +#include "src/parser/AutoParser.h" + +TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { + storm::settings::Settings* s = storm::settings::Settings::getInstance(); + storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.trans.rew"); + + ASSERT_EQ(parser.getType(), storm::models::MDP); + + std::shared_ptr> mdp = parser.getModel>(); + + ASSERT_EQ(mdp->getNumberOfStates(), 2095783ull); + ASSERT_EQ(mdp->getNumberOfTransitions(), 7714385ull); + + storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); + + storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("elected"); + storm::property::prctl::Eventually* eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + storm::property::prctl::ProbabilisticNoBoundOperator* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, true); + + std::vector result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete probFormula; + + apFormula = new storm::property::prctl::Ap("elected"); + eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, false); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete probFormula; + + apFormula = new storm::property::prctl::Ap("elected"); + storm::property::prctl::BoundedEventually* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually(apFormula, 25); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(boundedEventuallyFormula, true); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete probFormula; + + apFormula = new storm::property::prctl::Ap("elected"); + boundedEventuallyFormula = new storm::property::prctl::BoundedEventually(apFormula, 25); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(boundedEventuallyFormula, false); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[0] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete probFormula; + + apFormula = new storm::property::prctl::Ap("elected"); + storm::property::prctl::ReachabilityReward* reachabilityRewardFormula = new 
storm::property::prctl::ReachabilityReward(apFormula); + storm::property::prctl::RewardNoBoundOperator* rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, true); + + result = mc.checkNoBoundOperator(*rewardFormula); + + ASSERT_LT(std::abs(result[0] - 6.172433512), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; + + apFormula = new storm::property::prctl::Ap("elected"); + reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, false); + + result = mc.checkNoBoundOperator(*rewardFormula); + + ASSERT_LT(std::abs(result[0] - 6.1724344), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; +} + +TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Consensus) { + storm::settings::Settings* s = storm::settings::Settings::getInstance(); + // Increase the maximal number of iterations, because the solver does not converge otherwise. + // This is done in the main cpp unit + + storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.tra", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.lab", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.steps.state.rew", ""); + + ASSERT_EQ(parser.getType(), storm::models::MDP); + + std::shared_ptr> mdp = parser.getModel>(); + + ASSERT_EQ(mdp->getNumberOfStates(), 63616ull); + ASSERT_EQ(mdp->getNumberOfTransitions(), 213472ull); + + storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); + + storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("finished"); + storm::property::prctl::Eventually* eventuallyFormula = new storm::property::prctl::Eventually(apFormula); + storm::property::prctl::ProbabilisticNoBoundOperator* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, true); + + std::vector result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[31168] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete probFormula; + + apFormula = new storm::property::prctl::Ap("finished"); + storm::property::prctl::Ap* apFormula2 = new storm::property::prctl::Ap("all_coins_equal_0"); + storm::property::prctl::And* andFormula = new storm::property::prctl::And(apFormula, apFormula2); + eventuallyFormula = new storm::property::prctl::Eventually(andFormula); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, true); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[31168] - 0.4374282832), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete probFormula; + + apFormula = new storm::property::prctl::Ap("finished"); + apFormula2 = new storm::property::prctl::Ap("all_coins_equal_1"); + andFormula = new storm::property::prctl::And(apFormula, apFormula2); + eventuallyFormula = new storm::property::prctl::Eventually(andFormula); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, false); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[31168] - 0.5293286369), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete probFormula; + + apFormula = new storm::property::prctl::Ap("finished"); + apFormula2 = new storm::property::prctl::Ap("agree"); + 
storm::property::prctl::Not* notFormula = new storm::property::prctl::Not(apFormula2); + andFormula = new storm::property::prctl::And(apFormula, notFormula); + eventuallyFormula = new storm::property::prctl::Eventually(andFormula); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(eventuallyFormula, false); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[31168] - 0.10414097), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete probFormula; + + apFormula = new storm::property::prctl::Ap("finished"); + storm::property::prctl::BoundedEventually* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually(apFormula, 50ull); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(boundedEventuallyFormula, true); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[31168] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete probFormula; + + apFormula = new storm::property::prctl::Ap("finished"); + boundedEventuallyFormula = new storm::property::prctl::BoundedEventually(apFormula, 50ull); + probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator(boundedEventuallyFormula, false); + + result = mc.checkNoBoundOperator(*probFormula); + + ASSERT_LT(std::abs(result[31168] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete probFormula; + + apFormula = new storm::property::prctl::Ap("finished"); + storm::property::prctl::ReachabilityReward* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + storm::property::prctl::RewardNoBoundOperator* rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, true); + + result = mc.checkNoBoundOperator(*rewardFormula); + + ASSERT_LT(std::abs(result[31168] - 1725.593313), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; + + apFormula = new storm::property::prctl::Ap("finished"); + reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); + rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, false); + + result = mc.checkNoBoundOperator(*rewardFormula); + + ASSERT_LT(std::abs(result[31168] - 2183.142422), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); + delete rewardFormula; +} \ No newline at end of file From cd46a6b0c64a2fb3e7abaf10603c0fe20eb31cb8 Mon Sep 17 00:00:00 2001 From: PBerger Date: Sat, 15 Mar 2014 22:06:49 +0100 Subject: [PATCH 27/43] Fixed a bug in the equalModuloPrecision function. 
Former-commit-id: 465d90b4a7ad83cce9d0d18cbac3785946db18e7 --- src/utility/vector.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/utility/vector.h b/src/utility/vector.h index dd3bb2a1f..f8a56382a 100644 --- a/src/utility/vector.h +++ b/src/utility/vector.h @@ -330,6 +330,9 @@ namespace storm { template bool equalModuloPrecision(T const& val1, T const& val2, T precision, bool relativeError = true) { if (relativeError) { + if (val2 == 0) { + return (std::abs(val1) <= precision); + } if (std::abs(val1 - val2)/val2 > precision) return false; } else { if (std::abs(val1 - val2) > precision) return false; From 05814f5d737dcde92ad8a93b8aa63b7f3ecec20e Mon Sep 17 00:00:00 2001 From: PBerger Date: Sun, 16 Mar 2014 17:37:58 +0100 Subject: [PATCH 28/43] Fixed a bug in the equalModuloPrecision function of the CUDA Kernel. Added more debug output to the CUDA handler functions. Added a function for grouping of SCCs for better performance. Added functionality and accessors to the SparseMatrix. Former-commit-id: 770aec1b09ed0bbcc00bc2bc9913b8525d5bb94c --- .../srcCuda/basicValueIteration.cu | 62 ++++- .../srcCuda/basicValueIteration.h | 1 + ...onNondeterministicLinearEquationSolver.cpp | 259 ++++++++++++------ ...tionNondeterministicLinearEquationSolver.h | 12 +- src/storage/SparseMatrix.cpp | 9 + src/storage/SparseMatrix.h | 7 + 6 files changed, 257 insertions(+), 93 deletions(-) diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index d3c412ddf..ff93d29b6 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -35,7 +35,7 @@ __host__ __device__ T operator()(const T &x, const T &y) const { if (Relative) { if (y == 0) { - return x; + return ((x >= 0) ? (x) : (-x)); } const T result = (x - y) / y; return ((result >= 0) ? (result) : (-result)); @@ -73,6 +73,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy #ifdef DEBUG std::cout.sync_with_stdio(true); + std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl; std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory()))*100 << "%)." << std::endl; size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size(); std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; @@ -143,6 +144,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy goto cleanup; } +#ifdef DEBUG + std::cout << "(DLL) Finished allocating memory." << std::endl; +#endif + // Memory allocated, copy data to device cudaError_t cudaCopyResult; @@ -204,6 +209,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy goto cleanup; } +#ifdef DEBUG + std::cout << "(DLL) Finished copying data to GPU memory." 
<< std::endl; +#endif + // Data is on device, start Kernel while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization @@ -239,6 +248,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy std::swap(device_x, device_xSwap); } #ifdef DEBUG + std::cout << "(DLL) Finished kernel execution." << std::endl; std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl; #endif @@ -249,6 +259,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy goto cleanup; } +#ifdef DEBUG + std::cout << "(DLL) Finished copying result data." << std::endl; +#endif + // All code related to freeing memory and clearing up the device cleanup: if (device_matrixRowIndices != nullptr) { @@ -307,6 +321,9 @@ cleanup: } device_nondeterministicChoiceIndices = nullptr; } +#ifdef DEBUG + std::cout << "(DLL) Finished cleanup." << std::endl; +#endif } template @@ -323,6 +340,7 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(getFreeCudaMemory()) / static_cast(getTotalCudaMemory()))*100 << "%)." << std::endl; size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size(); std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; @@ -368,6 +386,10 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); +#ifdef DEBUG + std::cout << "(DLL) Finished kernel execution." 
<< std::endl; +#endif + // Get result back from the device cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost); if (cudaCopyResult != cudaSuccess) { @@ -417,6 +447,10 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector @@ -730,4 +767,27 @@ void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const max } else { basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); } +} + +size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) { + size_t const valueTypeSize = sizeof(double); + size_t const indexTypeSize = sizeof(uint_fast64_t); + + /* + IndexType* device_matrixRowIndices = nullptr; + IndexType* device_matrixColIndices = nullptr; + ValueType* device_matrixValues = nullptr; + ValueType* device_x = nullptr; + ValueType* device_xSwap = nullptr; + ValueType* device_b = nullptr; + ValueType* device_multiplyResult = nullptr; + IndexType* device_nondeterministicChoiceIndices = nullptr; + */ + + // Row Indices, Column Indices, Values, Choice Indices + size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize); + // Vectors x, xSwap, b, multiplyResult + size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize); + + return (matrixDataSize + vectorSizes); } \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index ad4ce0fc9..f23cbec28 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -8,6 +8,7 @@ // Library exports #include "cudaForStorm_Export.h" +cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index f50e8caf0..6807a77fd 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -42,12 +42,12 @@ namespace storm { } template - void 
TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, std::vector* multiplyResult, std::vector* newX) const { + void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { // Now, we need to determine the SCCs of the MDP and a topological sort. //std::vector> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph); //storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents); - + std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); //storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(*static_cast*>(&pseudoModel), false, false); storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); @@ -60,6 +60,9 @@ namespace storm { storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); + // Calculate the optimal distribution of sccs + std::vector>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); + // Set up the environment for the power method. // bool multiplyResultMemoryProvided = true; // if (multiplyResult == nullptr) { @@ -82,12 +85,13 @@ namespace storm { // solved after all SCCs it depends on have been solved. int counter = 0; - for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) { - storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; + for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { + bool const useGpu = sccIndexIt->first; + std::vector const& scc = sccIndexIt->second; // Generate a submatrix storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); - storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(subMatrixIndices, nondeterministicChoiceIndices); + storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); std::vector sccSubB(sccSubmatrix.getRowCount()); storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); std::vector sccSubX(sccSubmatrix.getColumnCount()); @@ -125,108 +129,115 @@ namespace storm { } // For the current SCC, we need to perform value iteration until convergence. 
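// The useGpu flag consumed below was precomputed from a per-SCC memory estimate; the
// following standalone sketch mirrors basicValueIteration_mvReduce_uint64_double_calculateMemorySize
// from the hunk above. It assumes IndexType = uint_fast64_t and ValueType = double (8 bytes each);
// the helper name is illustrative only and does not exist in the patched sources.
#include <cstddef>
#include <cstdint>

size_t estimateGpuFootprint(size_t rowCount, size_t rowGroupCount, size_t nnzCount) {
    size_t const indexSize = sizeof(uint_fast64_t);
    size_t const valueSize = sizeof(double);
    // CSR row offsets, column indices, values, and the row group (choice) offsets.
    size_t const matrixData = (rowCount + 1) * indexSize + nnzCount * (indexSize + valueSize) + (rowGroupCount + 1) * indexSize;
    // x and xSwap hold one entry per row group; b and multiplyResult hold one entry per row.
    size_t const vectors = 2 * rowGroupCount * valueSize + 2 * rowCount * valueSize;
    return matrixData + vectors;
}
// Example: an SCC group with 1000 row groups, 2500 rows and 10000 entries needs
// 188016 + 56000 = 244016 bytes and goes to the GPU only if that fits the free device memory.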
+ if (useGpu) { #ifdef STORM_HAVE_CUDAFORSTORM - if (!resetCudaDevice()) { - LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); - throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; - } - - LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); - LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); - LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); - - std::vector copyX(*currentX); - if (minimize) { - basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices); - } - else { - basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices); - } - - localIterations = 0; - converged = false; - while (!converged && localIterations < this->maximalNumberOfIterations) { - // Compute x' = A*x + b. - sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); - storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); - - //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); - //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + if (!resetCudaDevice()) { + LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); + throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; + } - /* - Versus: - A.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, b); - */ + LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); + LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); + LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); - // Reduce the vector x' by applying min/max for all non-deterministic choices. 
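// The reduce step referenced in the comment above collapses each row group of the product
// vector to a single entry. A minimal CPU sketch of the assumed semantics of
// storm::utility::vector::reduceVectorMin/reduceVectorMax (name and exact signature here
// are illustrative, not the library's):
#include <algorithm>
#include <cstdint>
#include <vector>

template<typename ValueType>
void reduceVectorSketch(bool minimize, std::vector<ValueType> const& in, std::vector<ValueType>& out, std::vector<uint_fast64_t> const& groupIndices) {
    // Each group [groupIndices[g], groupIndices[g + 1]) covers the nondeterministic choices
    // of one state; the best choice value becomes that state's new iterate.
    for (std::size_t g = 0; g + 1 < groupIndices.size(); ++g) {
        ValueType best = in[groupIndices[g]];
        for (uint_fast64_t row = groupIndices[g] + 1; row < groupIndices[g + 1]; ++row) {
            best = minimize ? std::min(best, in[row]) : std::max(best, in[row]);
        }
        out[g] = best;
    }
}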
+ std::vector copyX(*currentX); if (minimize) { - storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices); } else { - storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices); } + converged = true; + + // DEBUG + localIterations = 0; + converged = false; + while (!converged && localIterations < this->maximalNumberOfIterations) { + // Compute x' = A*x + b. + sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); + storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); + + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); + //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + + /* + Versus: + A.multiplyWithVector(*currentX, *multiplyResult); + storm::utility::vector::addVectorsInPlace(*multiplyResult, b); + */ + + // Reduce the vector x' by applying min/max for all non-deterministic choices. + if (minimize) { + storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } + else { + storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } - // Determine whether the method converged. - // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher - // running time. In fact, it is faster. This has to be investigated. - // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); - - // Update environment variables. - std::swap(currentX, swap); + // Determine whether the method converged. + // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher + // running time. In fact, it is faster. This has to be investigated. + // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); - ++localIterations; - ++globalIterations; - } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); + // Update environment variables. + std::swap(currentX, swap); - uint_fast64_t diffCount = 0; - for (size_t i = 0; i < currentX->size(); ++i) { - if (currentX->at(i) != copyX.at(i)) { - LOG4CPLUS_WARN(logger, "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i)); - std::cout << "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i) << std::endl; - } - } -#else - localIterations = 0; - converged = false; - while (!converged && localIterations < this->maximalNumberOfIterations) { - // Compute x' = A*x + b. 
- sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); - storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); - - //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); - //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); - - /* - Versus: - A.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, b); - */ - - // Reduce the vector x' by applying min/max for all non-deterministic choices. - if (minimize) { - storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + ++localIterations; + ++globalIterations; } - else { - storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); + + uint_fast64_t diffCount = 0; + for (size_t i = 0; i < currentX->size(); ++i) { + if (currentX->at(i) != copyX.at(i)) { + LOG4CPLUS_WARN(logger, "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i)); + std::cout << "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i) << std::endl; + ++diffCount; + } } + std::cout << "CUDA solution differed in " << diffCount << " of " << currentX->size() << " values." << std::endl; +#endif + } else { + localIterations = 0; + converged = false; + while (!converged && localIterations < this->maximalNumberOfIterations) { + // Compute x' = A*x + b. + sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); + storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); + + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); + //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + + /* + Versus: + A.multiplyWithVector(*currentX, *multiplyResult); + storm::utility::vector::addVectorsInPlace(*multiplyResult, b); + */ + + // Reduce the vector x' by applying min/max for all non-deterministic choices. + if (minimize) { + storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } + else { + storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } - // Determine whether the method converged. - // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher - // running time. In fact, it is faster. This has to be investigated. - // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); + // Determine whether the method converged. + // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher + // running time. In fact, it is faster. This has to be investigated. + // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); - // Update environment variables. - std::swap(currentX, swap); + // Update environment variables. 
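// The std::swap below is plain pointer double buffering: currentX always names the last
// completed iterate, swap the buffer being overwritten. A small self-contained fragment
// (local names only, nothing from the patched sources):
#include <utility>
#include <vector>

void doubleBufferingExample() {
    std::vector<double> bufferA(4, 0.0), bufferB(4, 0.0);
    std::vector<double>* currentX = &bufferA; // last completed iterate
    std::vector<double>* swap = &bufferB;     // target of the next write
    for (int iteration = 0; iteration < 3; ++iteration) {
        // ... write the new iterate into *swap, then exchange the roles without copying data:
        std::swap(currentX, swap);
    }
}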
+ std::swap(currentX, swap); - ++localIterations; - ++globalIterations; + ++localIterations; + ++globalIterations; + } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); -#endif + // The Result of this SCC has to be taken back into the main result vector innerIndex = 0; @@ -263,6 +274,72 @@ namespace storm { } } + template + std::vector>> + TopologicalValueIterationNondeterministicLinearEquationSolver::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const { + std::vector>> result; +#ifdef STORM_HAVE_CUDAFORSTORM + // 95% to have a bit of padding + size_t const cudaFreeMemory = static_cast(getFreeCudaMemory() * 0.95); + size_t lastResultIndex = 0; + + std::vector const& rowGroupIndices = matrix.getRowGroupIndices(); + size_t currentSize = 0; + for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { + storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; + + uint_fast64_t rowCount = 0; + uint_fast64_t entryCount = 0; + std::vector rowGroups; + rowGroups.reserve(scc.size()); + + for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { + rowCount += matrix.getRowGroupSize(*sccIt); + entryCount += matrix.getRowGroupEntryCount(*sccIt); + rowGroups.push_back(*sccIt); + } + + size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(rowCount), scc.size(), static_cast(entryCount)); + + if ((currentSize + sccSize) <= cudaFreeMemory) { + // There is enough space left in the current group + + if (currentSize == 0) { + result.push_back(std::make_pair(true, rowGroups)); + } + else { + result[lastResultIndex].second.insert(result[lastResultIndex].second.end(), rowGroups.begin(), rowGroups.end()); + } + currentSize += sccSize; + } + else { + if (sccSize <= cudaFreeMemory) { + ++lastResultIndex; + result.push_back(std::make_pair(true, rowGroups)); + currentSize = sccSize; + } + else { + // This group is too big to fit into the CUDA Memory by itself + lastResultIndex += 2; + result.push_back(std::make_pair(false, rowGroups)); + currentSize = 0; + } + } + } +#else + for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { + storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; + std::vector rowGroups; + rowGroups.reserve(scc.size()); + for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { + rowGroups.push_back(*sccIt); + result.push_back(std::make_pair(false, rowGroups)); + } + } +#endif + return result; + } + // Explicitly instantiate the solver. 
template class TopologicalValueIterationNondeterministicLinearEquationSolver; } // namespace solver diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h index 86262e062..84aba15fe 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h @@ -2,6 +2,11 @@ #define STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ #include "src/solver/NativeNondeterministicLinearEquationSolver.h" +#include "src/storage/StronglyConnectedComponentDecomposition.h" +#include "src/storage/SparseMatrix.h" + +#include +#include namespace storm { namespace solver { @@ -30,7 +35,12 @@ namespace storm { virtual NondeterministicLinearEquationSolver* clone() const override; - virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, std::vector* multiplyResult = nullptr, std::vector* newX = nullptr) const override; + virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult = nullptr, std::vector* newX = nullptr) const override; + private: + /*! + * Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory. + */ + std::vector>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const; }; } // namespace solver } // namespace storm diff --git a/src/storage/SparseMatrix.cpp b/src/storage/SparseMatrix.cpp index 69100e6bb..3aee26996 100644 --- a/src/storage/SparseMatrix.cpp +++ b/src/storage/SparseMatrix.cpp @@ -330,6 +330,15 @@ namespace storm { return entryCount; } + template + uint_fast64_t SparseMatrix::getRowGroupEntryCount(uint_fast64_t const group) const { + uint_fast64_t result = 0; + for (uint_fast64_t row = this->getRowGroupIndices()[group]; row < this->getRowGroupIndices()[group + 1]; ++row) { + result += (this->rowIndications[row + 1] - this->rowIndications[row]); + } + return result; + } + template uint_fast64_t SparseMatrix::getRowGroupCount() const { return rowGroupIndices.size() - 1; diff --git a/src/storage/SparseMatrix.h b/src/storage/SparseMatrix.h index f55b55b90..ae3a87abd 100644 --- a/src/storage/SparseMatrix.h +++ b/src/storage/SparseMatrix.h @@ -344,6 +344,13 @@ namespace storm { * @return The number of entries in the matrix. */ uint_fast64_t getEntryCount() const; + + /*! + * Returns the number of entries in the given row group of the matrix. + * + * @return The number of entries in the given row group of the matrix. + */ + uint_fast64_t getRowGroupEntryCount(uint_fast64_t const group) const; /*! * Returns the number of row groups in the matrix. From 0922921b24a0331f4170a475780e32c1caec334f Mon Sep 17 00:00:00 2001 From: PBerger Date: Mon, 17 Mar 2014 05:44:15 +0100 Subject: [PATCH 29/43] Updated cudaForStorm/CMakeLists.txt to make use of the new Git-based version schema. Added version functions to the CUDA Plugin. Edited storm.cpp to show version info for the CUDA Plugin. Fixed a critical error in basicValueIteration.cu which caused random SEGFAULTs
:P Streamlined the TopologicalValueIterationNondeterministicLinearEquationSolver.cpp. The SCC group optimizer now returns flat_sets instead of a vector as the sets are ordered, which is required for the Solver to work. This is now a stable version of StoRM containing a fully working CUDA-based solver. Former-commit-id: 47d5c2825caafacb361a6d20f73a8db1567f6eba --- resources/cudaForStorm/CMakeLists.txt | 22 +++- .../cudaForStorm/srcCuda/allCudaKernels.h | 3 +- .../srcCuda/basicValueIteration.cu | 4 +- resources/cudaForStorm/srcCuda/cudaForStorm.h | 2 + resources/cudaForStorm/srcCuda/version.cu | 28 +++++ resources/cudaForStorm/srcCuda/version.h | 16 +++ .../cudaForStorm/storm-cudaplugin-config.h.in | 8 +- ...onNondeterministicLinearEquationSolver.cpp | 109 ++++-------------- ...tionNondeterministicLinearEquationSolver.h | 2 +- src/storm.cpp | 19 ++- ...ValueIterationMdpPrctlModelCheckerTest.cpp | 44 ++++++- 11 files changed, 157 insertions(+), 100 deletions(-) create mode 100644 resources/cudaForStorm/srcCuda/version.cu create mode 100644 resources/cudaForStorm/srcCuda/version.h diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt index 6b0496d51..7bc37a097 100644 --- a/resources/cudaForStorm/CMakeLists.txt +++ b/resources/cudaForStorm/CMakeLists.txt @@ -130,7 +130,27 @@ elseif(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT EQUAL 0) else() message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})") endif() - + +# +# Make a version file containing the current version from git. +# +include(GetGitRevisionDescription) +git_describe_checkout(STORM_GIT_VERSION_STRING) +# Parse the git Tag into variables +string(REGEX REPLACE "^([0-9]+)\\..*" "\\1" STORM_CUDAPLUGIN_VERSION_MAJOR "${STORM_GIT_VERSION_STRING}") +string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_MINOR "${STORM_GIT_VERSION_STRING}") +string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_PATCH "${STORM_GIT_VERSION_STRING}") +string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-([0-9]+)\\-.*" "\\1" STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD "${STORM_GIT_VERSION_STRING}") +string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-([a-z0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_HASH "${STORM_GIT_VERSION_STRING}") +string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-[a-z0-9]+\\-(.*)" "\\1" STORM_CUDAPLUGIN_VERSION_APPENDIX "${STORM_GIT_VERSION_STRING}") +if ("${STORM_CUDAPLUGIN_VERSION_APPENDIX}" MATCHES "^.*dirty.*$") + set(STORM_CUDAPLUGIN_VERSION_DIRTY 1) +else() + set(STORM_CUDAPLUGIN_VERSION_DIRTY 0) +endif() +message(STATUS "StoRM (CudaPlugin) - Version information: ${STORM_CUDAPLUGIN_VERSION_MAJOR}.${STORM_CUDAPLUGIN_VERSION_MINOR}.${STORM_CUDAPLUGIN_VERSION_PATCH} (${STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD} commits ahead of Tag) built from ${STORM_CUDAPLUGIN_VERSION_HASH} (Dirty: ${STORM_CUDAPLUGIN_VERSION_DIRTY})") + + # Configure a header file to pass some of the CMake settings to the source code configure_file ( "${PROJECT_SOURCE_DIR}/storm-cudaplugin-config.h.in" diff --git a/resources/cudaForStorm/srcCuda/allCudaKernels.h b/resources/cudaForStorm/srcCuda/allCudaKernels.h index 182f1b770..50bf92191 100644 --- a/resources/cudaForStorm/srcCuda/allCudaKernels.h +++ b/resources/cudaForStorm/srcCuda/allCudaKernels.h @@ -2,4 +2,5 @@ #include "bandWidth.h" #include "basicAdd.h" #include "kernelSwitchTest.h" -#include "basicValueIteration.h" \ No newline at end of file +#include "basicValueIteration.h" +#include
"version.h" \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index ff93d29b6..231c6c4be 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -138,7 +138,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy } CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixRowCount + 1)); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1)); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; @@ -203,7 +203,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy } CUDA_CHECK_ALL_ERRORS(); - cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); + cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; goto cleanup; diff --git a/resources/cudaForStorm/srcCuda/cudaForStorm.h b/resources/cudaForStorm/srcCuda/cudaForStorm.h index fdc484eaa..2ea39c2d0 100644 --- a/resources/cudaForStorm/srcCuda/cudaForStorm.h +++ b/resources/cudaForStorm/srcCuda/cudaForStorm.h @@ -11,6 +11,8 @@ // Utility Functions #include "srcCuda/utility.h" +// Version Information +#include "srcCuda/version.h" diff --git a/resources/cudaForStorm/srcCuda/version.cu b/resources/cudaForStorm/srcCuda/version.cu new file mode 100644 index 000000000..3850c895c --- /dev/null +++ b/resources/cudaForStorm/srcCuda/version.cu @@ -0,0 +1,28 @@ +#include "version.h" + +#include "storm-cudaplugin-config.h" + +size_t getStormCudaPluginVersionMajor() { + return STORM_CUDAPLUGIN_VERSION_MAJOR; +} + +size_t getStormCudaPluginVersionMinor() { + return STORM_CUDAPLUGIN_VERSION_MINOR; +} + +size_t getStormCudaPluginVersionPatch() { + return STORM_CUDAPLUGIN_VERSION_PATCH; +} + +size_t getStormCudaPluginVersionCommitsAhead() { + return STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD; +} + +const char* getStormCudaPluginVersionHash() { + static const std::string versionHash = STORM_CUDAPLUGIN_VERSION_HASH; + return versionHash.c_str(); +} + +bool getStormCudaPluginVersionIsDirty() { + return ((STORM_CUDAPLUGIN_VERSION_DIRTY) != 0); +} \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/version.h b/resources/cudaForStorm/srcCuda/version.h new file mode 100644 index 000000000..de3f4f16c --- /dev/null +++ b/resources/cudaForStorm/srcCuda/version.h @@ -0,0 +1,16 @@ +#ifndef STORM_CUDAFORSTORM_VERSION_H_ +#define STORM_CUDAFORSTORM_VERSION_H_ + +// Library exports +#include "cudaForStorm_Export.h" + +#include + +cudaForStorm_EXPORT size_t getStormCudaPluginVersionMajor(); +cudaForStorm_EXPORT size_t getStormCudaPluginVersionMinor(); +cudaForStorm_EXPORT size_t getStormCudaPluginVersionPatch(); +cudaForStorm_EXPORT size_t getStormCudaPluginVersionCommitsAhead(); +cudaForStorm_EXPORT const char* getStormCudaPluginVersionHash(); +cudaForStorm_EXPORT bool 
getStormCudaPluginVersionIsDirty(); + +#endif // STORM_CUDAFORSTORM_VERSION_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/storm-cudaplugin-config.h.in b/resources/cudaForStorm/storm-cudaplugin-config.h.in index d59532a6c..1cfc9119e 100644 --- a/resources/cudaForStorm/storm-cudaplugin-config.h.in +++ b/resources/cudaForStorm/storm-cudaplugin-config.h.in @@ -8,6 +8,12 @@ #ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ #define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ - +// Version Information +#define STORM_CUDAPLUGIN_VERSION_MAJOR @STORM_CUDAPLUGIN_VERSION_MAJOR@ // The major version of StoRM +#define STORM_CUDAPLUGIN_VERSION_MINOR @STORM_CUDAPLUGIN_VERSION_MINOR@ // The minor version of StoRM +#define STORM_CUDAPLUGIN_VERSION_PATCH @STORM_CUDAPLUGIN_VERSION_PATCH@ // The patch version of StoRM +#define STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD @STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD@ // How many commits passed since the tag was last set +#define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" // The short hash of the git commit this build is based on +#define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ // 0 iff no files were modified in the checkout, 1 otherwise #endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 6807a77fd..cdc468936 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -1,6 +1,7 @@ #include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h" #include +#include #include "src/settings/Settings.h" #include "src/utility/vector.h" @@ -15,7 +16,9 @@ extern log4cplus::Logger logger; #include "storm-config.h" -#include "cudaForStorm.h" +#ifdef STORM_HAVE_CUDAFORSTORM +# include "cudaForStorm.h" +#endif namespace storm { namespace solver { @@ -44,12 +47,9 @@ namespace storm { } template void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { - // Now, we need to determine the SCCs of the MDP and a topological sort. - //std::vector> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph); - //storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents); + // Now, we need to determine the SCCs of the MDP and perform a topological sort.
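// One standard way to obtain such an order is Kahn's algorithm over the SCC dependency
// graph; the actual implementation used here is storm::utility::graph::getTopologicalSort.
// In this illustrative sketch, successors[i] lists the SCC indices adjacent to SCC i, and
// the input is assumed to be a DAG, which an SCC dependency graph is by construction.
#include <cstdint>
#include <deque>
#include <vector>

std::vector<uint_fast64_t> topologicalSortSketch(std::vector<std::vector<uint_fast64_t>> const& successors) {
    std::vector<uint_fast64_t> inDegree(successors.size(), 0);
    for (auto const& adjacent : successors) {
        for (auto target : adjacent) ++inDegree[target];
    }
    std::deque<uint_fast64_t> ready;
    for (uint_fast64_t i = 0; i < successors.size(); ++i) {
        if (inDegree[i] == 0) ready.push_back(i);
    }
    std::vector<uint_fast64_t> order;
    while (!ready.empty()) {
        uint_fast64_t current = ready.front();
        ready.pop_front();
        order.push_back(current);
        // Releasing a node may make its successors ready.
        for (auto target : successors[current]) {
            if (--inDegree[target] == 0) ready.push_back(target);
        }
    }
    return order; // one valid topological order of the DAG
}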
std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); - //storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(*static_cast*>(&pseudoModel), false, false); storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); if (sccDecomposition.size() == 0) { @@ -61,20 +61,10 @@ namespace storm { std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); // Calculate the optimal distribution of sccs - std::vector>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); - - // Set up the environment for the power method. -// bool multiplyResultMemoryProvided = true; -// if (multiplyResult == nullptr) { -// multiplyResult = new std::vector(A.getRowCount()); -// multiplyResultMemoryProvided = false; -// } + std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); + LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); + std::vector* currentX = nullptr; - //bool xMemoryProvided = true; - //if (newX == nullptr) { - // newX = new std::vector(x.size()); - // xMemoryProvided = false; - //} std::vector* swap = nullptr; uint_fast64_t currentMaxLocalIterations = 0; uint_fast64_t localIterations = 0; @@ -87,7 +77,7 @@ namespace storm { for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { bool const useGpu = sccIndexIt->first; - std::vector const& scc = sccIndexIt->second; + storm::storage::StateBlock const& scc = sccIndexIt->second; // Generate a submatrix storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); @@ -140,63 +130,16 @@ namespace storm { LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); - std::vector copyX(*currentX); if (minimize) { - basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices); + basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices); } else { - basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices); + basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices); } converged = true; - - // DEBUG - localIterations = 0; - converged = false; - 
while (!converged && localIterations < this->maximalNumberOfIterations) { - // Compute x' = A*x + b. - sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); - storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); - - //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); - //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); - - /* - Versus: - A.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, b); - */ - - // Reduce the vector x' by applying min/max for all non-deterministic choices. - if (minimize) { - storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); - } - else { - storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); - } - - // Determine whether the method converged. - // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher - // running time. In fact, it is faster. This has to be investigated. - // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); - - // Update environment variables. - std::swap(currentX, swap); - - ++localIterations; - ++globalIterations; - } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); - - uint_fast64_t diffCount = 0; - for (size_t i = 0; i < currentX->size(); ++i) { - if (currentX->at(i) != copyX.at(i)) { - LOG4CPLUS_WARN(logger, "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i)); - std::cout << "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i) << std::endl; - ++diffCount; - } - } - std::cout << "CUDA solution differed in " << diffCount << " of " << currentX->size() << " values." << std::endl; +#else + LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; #endif } else { localIterations = 0; @@ -256,14 +199,6 @@ namespace storm { currentMaxLocalIterations = localIterations; } } - - //if (!xMemoryProvided) { - // delete newX; - //} - -// if (!multiplyResultMemoryProvided) { -// delete multiplyResult; -// } // Check if the solver converged and issue a warning otherwise. 
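// The converged flag tested below comes from comparing two successive iterates. A sketch of
// the assumed semantics of storm::utility::vector::equalModuloPrecision (illustrative name
// and handling of zero entries; the library's exact definition may differ):
#include <cmath>
#include <cstddef>
#include <vector>

template<typename ValueType>
bool equalModuloPrecisionSketch(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType precision, bool relative) {
    for (std::size_t i = 0; i < x.size(); ++i) {
        ValueType const difference = std::abs(x[i] - y[i]);
        if (relative) {
            // Relative criterion: the change must be small compared to the value itself.
            if (x[i] != 0 && (difference / std::abs(x[i])) > precision) return false;
        } else if (difference > precision) {
            // Absolute criterion.
            return false;
        }
    }
    return true;
}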
if (converged) { @@ -275,9 +210,9 @@ namespace storm { } template - std::vector>> + std::vector> TopologicalValueIterationNondeterministicLinearEquationSolver::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const { - std::vector>> result; + std::vector> result; #ifdef STORM_HAVE_CUDAFORSTORM // 95% to have a bit of padding size_t const cudaFreeMemory = static_cast(getFreeCudaMemory() * 0.95); @@ -290,13 +225,13 @@ namespace storm { uint_fast64_t rowCount = 0; uint_fast64_t entryCount = 0; - std::vector rowGroups; + storm::storage::StateBlock rowGroups; rowGroups.reserve(scc.size()); for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { rowCount += matrix.getRowGroupSize(*sccIt); entryCount += matrix.getRowGroupEntryCount(*sccIt); - rowGroups.push_back(*sccIt); + rowGroups.insert(*sccIt); } size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(rowCount), scc.size(), static_cast(entryCount)); @@ -308,7 +243,7 @@ namespace storm { result.push_back(std::make_pair(true, rowGroups)); } else { - result[lastResultIndex].second.insert(result[lastResultIndex].second.end(), rowGroups.begin(), rowGroups.end()); + result[lastResultIndex].second.insert(rowGroups.begin(), rowGroups.end()); } currentSize += sccSize; } @@ -329,12 +264,12 @@ namespace storm { #else for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; - std::vector rowGroups; + storm::storage::StateBlock rowGroups; rowGroups.reserve(scc.size()); for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { - rowGroups.push_back(*sccIt); - result.push_back(std::make_pair(false, rowGroups)); + rowGroups.insert(*sccIt); } + result.push_back(std::make_pair(false, rowGroups)); } #endif return result; diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h index 84aba15fe..40d9df354 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h @@ -40,7 +40,7 @@ namespace storm { /*! * Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory. */ - std::vector>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const; + std::vector> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const; }; } // namespace solver } // namespace storm diff --git a/src/storm.cpp b/src/storm.cpp index 636606049..cd1ec951c 100644 --- a/src/storm.cpp +++ b/src/storm.cpp @@ -75,6 +75,9 @@ #ifdef STORM_HAVE_Z3 # include "z3.h" #endif +#ifdef STORM_HAVE_CUDAFORSTORM +# include "cudaForStorm.h" +#endif #include #include @@ -173,8 +176,20 @@ void printHeader(const int argc, const char* argv[]) { if (STORM_CPP_VERSION_DIRTY == 1) { std::cout << " (DIRTY)"; } - std::cout << std::endl; - + std::cout << "." 
<< std::endl; + +#ifdef STORM_HAVE_CUDAFORSTORM + std::cout << "Compiled with Runtime Support for the StoRM CUDA Plugin." << std::endl; + std::cout << "Detected the StoRM CUDA Plugin in Version " << getStormCudaPluginVersionMajor() << "." << getStormCudaPluginVersionMinor() << "." << getStormCudaPluginVersionPatch(); + if (getStormCudaPluginVersionCommitsAhead() != 0) { + std::cout << " (+" << getStormCudaPluginVersionCommitsAhead() << " commits)"; + } + std::cout << " built from revision " << getStormCudaPluginVersionHash(); + if (getStormCudaPluginVersionIsDirty()) { + std::cout << " (DIRTY)"; + } + std::cout << "." << std::endl; +#endif #ifdef STORM_HAVE_INTELTBB std::cout << "Linked with Intel Threading Building Blocks v" << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << " (Interface version " << TBB_INTERFACE_VERSION << ")." << std::endl; #endif diff --git a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp index f39d17022..811b9532d 100644 --- a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp +++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -7,6 +7,8 @@ #include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h" #include "src/parser/AutoParser.h" +#include "storm-config.h" + TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { storm::settings::Settings* s = storm::settings::Settings::getInstance(); storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); @@ -86,8 +88,12 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { storm::property::prctl::RewardNoBoundOperator* rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, true); result = mc.checkNoBoundOperator(*rewardFormula); - + +#ifdef STORM_HAVE_CUDAFORSTORM + ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#else ASSERT_LT(std::abs(result[0] - 7.33332904), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#endif delete rewardFormula; apFormula = new storm::property::prctl::Ap("done"); +  reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, false); result = mc.checkNoBoundOperator(*rewardFormula);; - + +#ifdef STORM_HAVE_CUDAFORSTORM + ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#else ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#endif delete rewardFormula; storm::parser::AutoParser stateRewardParser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", ""); @@ -112,8 +122,12 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, true); result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula); - + +#ifdef STORM_HAVE_CUDAFORSTORM + ASSERT_LT(std::abs(result[0] - 7.333329499), 
s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#else ASSERT_LT(std::abs(result[0] - 7.33332904), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#endif delete rewardFormula; apFormula = new storm::property::prctl::Ap("done"); @@ -122,7 +136,11 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula); +#ifdef STORM_HAVE_CUDAFORSTORM + ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#else ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#endif delete rewardFormula; storm::parser::AutoParser stateAndTransitionRewardParser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); @@ -139,7 +157,11 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); +#ifdef STORM_HAVE_CUDAFORSTORM + ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#else ASSERT_LT(std::abs(result[0] - 14.6666581), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#endif delete rewardFormula; apFormula = new storm::property::prctl::Ap("done"); @@ -148,7 +170,11 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); +#ifdef STORM_HAVE_CUDAFORSTORM + ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#else ASSERT_LT(std::abs(result[0] - 14.666663), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#endif delete rewardFormula; } @@ -209,17 +235,25 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { storm::property::prctl::ReachabilityReward* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); storm::property::prctl::RewardNoBoundOperator* rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, true); - result = mc.checkNoBoundOperator(*rewardFormula);; + result = mc.checkNoBoundOperator(*rewardFormula); +#ifdef STORM_HAVE_CUDAFORSTORM ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#else + ASSERT_LT(std::abs(result[0] - 4.285701547), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#endif delete rewardFormula; apFormula = new storm::property::prctl::Ap("elected"); reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward(apFormula); rewardFormula = new storm::property::prctl::RewardNoBoundOperator(reachabilityRewardFormula, false); - result = mc.checkNoBoundOperator(*rewardFormula);; + result = mc.checkNoBoundOperator(*rewardFormula); +#ifdef STORM_HAVE_CUDAFORSTORM ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#else + ASSERT_LT(std::abs(result[0] - 4.285703591), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); +#endif delete rewardFormula; } From 26500ff4a8f0f4a841803d62aa71e7cfbde93ee6 Mon Sep 17 
00:00:00 2001 From: PBerger Date: Wed, 19 Mar 2014 03:54:31 +0100 Subject: [PATCH 30/43] Refactored the CUDA Kernel to once again use the "hacked" combination of column indices and values with a bit of reinterpret_cast magic. Refactored the CUDA-SCC grouping algorithm as it took 80x longer to calculate the groups than it took to calculate the entire solution. Former-commit-id: 5a5ffabe388f9220bb900420e771a43f256d93b4 --- .../srcCuda/basicValueIteration.cu | 94 +++++-------------- .../cudaForStorm/srcCuda/cuspExtension.h | 36 +++---- ...onNondeterministicLinearEquationSolver.cpp | 91 ++++++++++++------ 3 files changed, 104 insertions(+), 117 deletions(-) diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index 231c6c4be..b7a879e8b 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -58,13 +58,12 @@ void exploadVector(std::vector> const& inputVect template void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { - std::vector matrixColumnIndices; - std::vector matrixValues; - exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); + //std::vector matrixColumnIndices; + //std::vector matrixValues; + //exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); IndexType* device_matrixRowIndices = nullptr; - IndexType* device_matrixColIndices = nullptr; - ValueType* device_matrixValues = nullptr; + IndexType* device_matrixColIndicesAndValues = nullptr; ValueType* device_x = nullptr; ValueType* device_xSwap = nullptr; ValueType* device_b = nullptr; ValueType* device_multiplyResult = nullptr; IndexType* device_nondeterministicChoiceIndices = nullptr; @@ -96,16 +95,9 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy } CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount); - if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl; - goto cleanup; - } - - CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixValues), sizeof(ValueType) * matrixNnzCount); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl; + std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." 
<< std::endl; goto cleanup; } @@ -159,16 +151,9 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy } CUDA_CHECK_ALL_ERRORS(); - cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); - if (cudaCopyResult != cudaSuccess) { - std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl; - goto cleanup; - } - - CUDA_CHECK_ALL_ERRORS(); - cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { - std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl; + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; goto cleanup; } @@ -216,7 +201,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy // Data is on device, start Kernel while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization - cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult); + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); thrust::device_ptr devicePtrThrust_b(device_b); @@ -272,19 +257,12 @@ cleanup: } device_matrixRowIndices = nullptr; } - if (device_matrixColIndices != nullptr) { - cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices); + if (device_matrixColIndicesAndValues != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); if (cudaFreeResult != cudaSuccess) { - std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl; + std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; } - device_matrixColIndices = nullptr; - } - if (device_matrixValues != nullptr) { - cudaError_t cudaFreeResult = cudaFree(device_matrixValues); - if (cudaFreeResult != cudaSuccess) { - std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." 
<< std::endl; - } - device_matrixValues = nullptr; + device_matrixColIndicesAndValues = nullptr; } if (device_x != nullptr) { cudaError_t cudaFreeResult = cudaFree(device_x); @@ -328,13 +306,8 @@ cleanup: template void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { - std::vector matrixColumnIndices; - std::vector matrixValues; - exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); - IndexType* device_matrixRowIndices = nullptr; - IndexType* device_matrixColIndices = nullptr; - ValueType* device_matrixValues = nullptr; + IndexType* device_matrixColIndicesAndValues = nullptr; ValueType* device_x = nullptr; ValueType* device_multiplyResult = nullptr; @@ -359,16 +332,9 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount); - if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl; - goto cleanup; - } - - CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixValues), sizeof(ValueType) * matrixNnzCount); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl; + std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } @@ -401,16 +367,9 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult); + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); #ifdef DEBUG @@ -460,19 +419,12 @@ cleanup: } device_matrixRowIndices = nullptr; } - if (device_matrixColIndices != nullptr) { - cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices); - if (cudaFreeResult != cudaSuccess) { - std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl; - } - device_matrixColIndices = nullptr; - } - if (device_matrixValues != nullptr) { - cudaError_t cudaFreeResult = cudaFree(device_matrixValues); + if (device_matrixColIndicesAndValues != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); if (cudaFreeResult != cudaSuccess) { - std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl; + std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." 
<< std::endl; } - device_matrixValues = nullptr; + device_matrixColIndicesAndValues = nullptr; } if (device_x != nullptr) { cudaError_t cudaFreeResult = cudaFree(device_x); diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h index f07849816..7c9a9fb28 100644 --- a/resources/cudaForStorm/srcCuda/cuspExtension.h +++ b/resources/cudaForStorm/srcCuda/cuspExtension.h @@ -61,7 +61,7 @@ namespace device template __launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) __global__ void -storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType * x, ValueType * y) +storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y) { __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2]; @@ -95,17 +95,17 @@ storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType // accumulate local sums if(jj >= row_start && jj < row_end) - sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x); + sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); // accumulate local sums for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) - sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x); + sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); } else { // accumulate local sums for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) - sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x); + sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); } // store local sum in shared memory @@ -244,7 +244,7 @@ void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_ } template -void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) +void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) { const size_t THREADS_PER_BLOCK = 128; const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; @@ -256,36 +256,36 @@ void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType bind_x(x); storm_cuda_opt_spmv_csr_vector_kernel <<>> - (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); + (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); if (UseCache) unbind_x(x); } template -void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) +void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) { const IndexType nnz_per_row = num_entries / num_rows; - if 
(nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); + __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); } template -void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) +void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) { const IndexType nnz_per_row = num_entries / num_rows; - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); + __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); } // NON-OPT diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index cdc468936..fa68468ec 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -1,7 +1,6 @@ #include 
"src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h" #include -#include #include "src/settings/Settings.h" #include "src/utility/vector.h" @@ -46,7 +45,6 @@ namespace storm { template void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { - // Now, we need to determine the SCCs of the MDP and perform a topological sort. std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); @@ -219,57 +217,94 @@ namespace storm { size_t lastResultIndex = 0; std::vector const& rowGroupIndices = matrix.getRowGroupIndices(); + + size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(matrix.getRowCount()), rowGroupIndices.size(), static_cast(matrix.getEntryCount())); + size_t const gpuSizePerRowGroup = std::max(static_cast(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast(1)); + size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup; + size_t currentSize = 0; - for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { - storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; + size_t neededReserveSize = 0; + size_t startIndex = 0; + for (size_t i = 0; i < topologicalSort.size(); ++i) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]]; + size_t const currentSccSize = scc.size(); uint_fast64_t rowCount = 0; uint_fast64_t entryCount = 0; - storm::storage::StateBlock rowGroups; - rowGroups.reserve(scc.size()); for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { rowCount += matrix.getRowGroupSize(*sccIt); entryCount += matrix.getRowGroupEntryCount(*sccIt); - rowGroups.insert(*sccIt); } size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(rowCount), scc.size(), static_cast(entryCount)); if ((currentSize + sccSize) <= cudaFreeMemory) { // There is enough space left in the current group - - if (currentSize == 0) { - result.push_back(std::make_pair(true, rowGroups)); - } - else { - result[lastResultIndex].second.insert(rowGroups.begin(), rowGroups.end()); - } + neededReserveSize += currentSccSize; currentSize += sccSize; - } - else { - if (sccSize <= cudaFreeMemory) { + } else { + // This would make the last open group to big for the GPU + + if (startIndex < i) { + if ((startIndex + 1) < i) { + // More than one component + std::vector tempGroups; + tempGroups.reserve(neededReserveSize); + for (size_t j = startIndex; j < i; ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + } + std::sort(tempGroups.begin(), tempGroups.end()); + + result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); + } else { + // Only one group, copy construct. 
+ result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); + } ++lastResultIndex; - result.push_back(std::make_pair(true, rowGroups)); - currentSize = sccSize; } - else { + + if (sccSize <= cudaFreeMemory) { + currentSize = sccSize; + neededReserveSize = currentSccSize; + startIndex = i; + } else { // This group is too big to fit into the CUDA Memory by itself - lastResultIndex += 2; - result.push_back(std::make_pair(false, rowGroups)); + result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]])))); + ++lastResultIndex; + currentSize = 0; + neededReserveSize = 0; + startIndex = i + 1; + } + } + } + + size_t const topologicalSortSize = topologicalSort.size(); + if (startIndex < topologicalSortSize) { + if ((startIndex + 1) < topologicalSortSize) { + // More than one component + std::vector tempGroups; + tempGroups.reserve(neededReserveSize); + for (size_t j = startIndex; j < topologicalSortSize; ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); } + std::sort(tempGroups.begin(), tempGroups.end()); + + result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); } + else { + // Only one group, move construct. + result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); + } + ++lastResultIndex; } #else for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; - storm::storage::StateBlock rowGroups; - rowGroups.reserve(scc.size()); - for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { - rowGroups.insert(*sccIt); - } - result.push_back(std::make_pair(false, rowGroups)); + result.push_back(std::make_pair(false, scc)); } #endif return result; From a4a17de4fcee12c4797d08ccb9fcf3538b8ebd9c Mon Sep 17 00:00:00 2001 From: PBerger Date: Thu, 20 Mar 2014 03:57:14 +0100 Subject: [PATCH 31/43] Added timing for PRCTL formula checking. Replaced std::sort with std::inplace_merge, saving another factor 2.
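A minimal stand-alone sketch of the grouping strategy in the hunks above, including the std::inplace_merge refinement this subject describes: consecutive SCCs in topological order are packed into one group while the estimated GPU footprint still fits into free CUDA memory, and since each SCC's state set is already sorted, merging it into the sorted prefix replaces the final std::sort. The sizeOfScc callback stands in for basicValueIteration_mvReduce_uint64_double_calculateMemorySize and is an assumption of this sketch, not code from the patch.

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Returns one (fitsOnGpu, sorted state indices) pair per group; a sketch, not the solver's code.
std::vector<std::pair<bool, std::vector<std::size_t>>> packSccsForGpu(
        std::vector<std::vector<std::size_t>> const& sccsInTopologicalOrder,
        std::size_t (*sizeOfScc)(std::vector<std::size_t> const&),  // assumed memory estimate
        std::size_t freeCudaMemory) {
    std::vector<std::pair<bool, std::vector<std::size_t>>> groups;
    std::vector<std::size_t> openGroup;  // states of the currently open group, kept sorted
    std::size_t openSize = 0;            // estimated GPU footprint of the open group
    for (std::vector<std::size_t> const& scc : sccsInTopologicalOrder) {
        std::size_t const sccSize = sizeOfScc(scc);
        if (openSize + sccSize <= freeCudaMemory) {
            // Append the sorted SCC, then merge it into the sorted prefix.
            std::size_t const middle = openGroup.size();
            openGroup.insert(openGroup.end(), scc.begin(), scc.end());
            std::inplace_merge(openGroup.begin(), openGroup.begin() + middle, openGroup.end());
            openSize += sccSize;
        } else {
            // Close the currently open group before starting a new one.
            if (!openGroup.empty()) {
                groups.emplace_back(true, std::move(openGroup));
                openGroup.clear();
            }
            if (sccSize <= freeCudaMemory) {
                openGroup = scc;
                openSize = sccSize;
            } else {
                // Too large even on its own: mark it for the non-GPU fallback.
                groups.emplace_back(false, scc);
                openSize = 0;
            }
        }
    }
    if (!openGroup.empty()) {
        groups.emplace_back(true, std::move(openGroup));
    }
    return groups;
}

The sketch keeps the merge midpoint as an index rather than an iterator, which stays valid across reallocation; the patch instead keeps an end iterator and relies on the up-front reserve(neededReserveSize) to prevent reallocation during the inserts.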
Former-commit-id: 961c31bb68a779b8322b3f93f4ed27134833d6de --- ...onNondeterministicLinearEquationSolver.cpp | 20 +++++++++++++++---- src/storm.cpp | 12 +++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index fa68468ec..84507912e 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -251,11 +251,17 @@ namespace storm { // More than one component std::vector tempGroups; tempGroups.reserve(neededReserveSize); - for (size_t j = startIndex; j < i; ++j) { + + // Copy the first group to make inplace_merge possible + storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; + tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); + + for (size_t j = startIndex + 1; j < i; ++j) { storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + std::vector::iterator const middleIterator = tempGroups.end(); tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); } - std::sort(tempGroups.begin(), tempGroups.end()); result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); } else { @@ -287,11 +293,17 @@ namespace storm { // More than one component std::vector tempGroups; tempGroups.reserve(neededReserveSize); - for (size_t j = startIndex; j < topologicalSortSize; ++j) { + + // Copy the first group to make inplace_merge possible + storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; + tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); + + for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + std::vector::iterator const middleIterator = tempGroups.end(); tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); } - std::sort(tempGroups.begin(), tempGroups.end()); result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); } diff --git a/src/storm.cpp b/src/storm.cpp index cd1ec951c..c95fd1338 100644 --- a/src/storm.cpp +++ b/src/storm.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include "storm-config.h" #include "src/models/Dtmc.h" @@ -114,14 +115,14 @@ void printUsage() { ULARGE_INTEGER uLargeInteger; uLargeInteger.LowPart = ftKernel.dwLowDateTime; uLargeInteger.HighPart = ftKernel.dwHighDateTime; - double kernelTime = uLargeInteger.QuadPart / 10000.0; // 100 ns Resolution to milliseconds + double kernelTime = static_cast(uLargeInteger.QuadPart) / 10000.0; // 100 ns Resolution to milliseconds uLargeInteger.LowPart = ftUser.dwLowDateTime; uLargeInteger.HighPart = ftUser.dwHighDateTime; - double userTime = uLargeInteger.QuadPart / 10000.0; + double userTime = static_cast(uLargeInteger.QuadPart) / 10000.0; std::cout << "CPU Time: " << std::endl; - std::cout << "\tKernel Time: " << std::setprecision(3) << kernelTime << std::endl; - std::cout << "\tUser Time: " << std::setprecision(3) << userTime << std::endl; + std::cout << 
"\tKernel Time: " << std::setprecision(5) << kernelTime << std::endl; + std::cout << "\tUser Time: " << std::setprecision(5) << userTime << std::endl; #endif } @@ -315,8 +316,11 @@ void checkPrctlFormulae(storm::modelchecker::prctl::AbstractModelChecker std::list*> formulaList = storm::parser::PrctlFileParser(chosenPrctlFile); for (auto formula : formulaList) { + std::chrono::high_resolution_clock::time_point startTime = std::chrono::high_resolution_clock::now(); modelchecker.check(*formula); delete formula; + std::chrono::high_resolution_clock::time_point endTime = std::chrono::high_resolution_clock::now(); + std::cout << "Checking the formula took " << std::chrono::duration_cast(endTime - startTime).count() << "ms." << std::endl; } } } From 47b34171f237876d90434f026248b8b86d62cb3d Mon Sep 17 00:00:00 2001 From: PBerger Date: Wed, 16 Apr 2014 18:46:31 +0200 Subject: [PATCH 32/43] Fixed a typo. Former-commit-id: b5a3026aa999adbbb6d9c268d4cabed79e39aedd --- src/utility/graph.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utility/graph.h b/src/utility/graph.h index f6fb06c3c..e9757d50e 100644 --- a/src/utility/graph.h +++ b/src/utility/graph.h @@ -49,7 +49,7 @@ namespace storm { // Prepare the resulting bit vector. storm::storage::BitVector statesWithProbabilityGreater0(model.getNumberOfStates()); - // Add all psi states as the already satisfy the condition. + // Add all psi states as they already satisfy the condition. statesWithProbabilityGreater0 |= psiStates; // Initialize the stack used for the DFS with the states. From d2f4c857114224452a17a41c3813cac4bcdf6da7 Mon Sep 17 00:00:00 2001 From: PBerger Date: Tue, 1 Jul 2014 01:35:06 +0200 Subject: [PATCH 33/43] Made changes to comply with new SparseMatrix Interface (YUCK). Fixed tests, all that stuff. 
Former-commit-id: c78de5f8cecb952d23d56713414c69fff0ddc570 --- .../srcCuda/basicValueIteration.cu | 10 +-- .../srcCuda/basicValueIteration.h | 80 ++++++++++++++++++- src/models/PseudoModel.cpp | 2 +- ...onNondeterministicLinearEquationSolver.cpp | 6 +- src/storage/SparseMatrix.cpp | 2 +- src/storage/SparseMatrix.h | 2 +- src/utility/graph.h | 6 +- ...ValueIterationMdpPrctlModelCheckerTest.cpp | 29 ++----- ...glyConnectedComponentDecompositionTest.cpp | 4 +- 9 files changed, 98 insertions(+), 43 deletions(-) diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index b7a879e8b..33ee6218c 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -57,7 +57,7 @@ void exploadVector(std::vector> const& inputVect } template -void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { +void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { //std::vector matrixColumnIndices; //std::vector matrixValues; //exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); @@ -305,7 +305,7 @@ cleanup: } template -void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { +void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { IndexType* device_matrixRowIndices = nullptr; IndexType* device_matrixColIndicesAndValues = nullptr; ValueType* device_x = nullptr; @@ -681,7 +681,7 @@ cleanup: * */ -void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { +void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); } @@ -705,7 +705,7 @@ void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector(x, y, maxElement); } -void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { +void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { if (relativePrecisionCheck) { basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, 
nondeterministicChoiceIndices); } else { @@ -713,7 +713,7 @@ void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const max } } -void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { +void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { if (relativePrecisionCheck) { basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); } else { diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index f23cbec28..1f1f88060 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -8,10 +8,84 @@ // Library exports #include "cudaForStorm_Export.h" +/* Helper declaration to cope with new internal format */ +#ifndef STORM_STORAGE_SPARSEMATRIX_H_ +namespace storm { + namespace storage { +template + class MatrixEntry { + public: + /*! + * Constructs a matrix entry with the given column and value. + * + * @param column The column of the matrix entry. + * @param value The value of the matrix entry. + */ + MatrixEntry(uint_fast64_t column, T value); + + /*! + * Move-constructs the matrix entry from the given column-value pair. + * + * @param pair The column-value pair from which to move-construct the matrix entry. + */ + MatrixEntry(std::pair&& pair); + + //MatrixEntry() = default; + //MatrixEntry(MatrixEntry const& other) = default; + //MatrixEntry& operator=(MatrixEntry const& other) = default; +#ifndef WINDOWS + //MatrixEntry(MatrixEntry&& other) = default; + //MatrixEntry& operator=(MatrixEntry&& other) = default; +#endif + + /*! + * Retrieves the column of the matrix entry. + * + * @return The column of the matrix entry. + */ + uint_fast64_t const& getColumn() const; + + /*! + * Retrieves the column of the matrix entry. + * + * @return The column of the matrix entry. + */ + uint_fast64_t& getColumn(); + + /*! + * Retrieves the value of the matrix entry. + * + * @return The value of the matrix entry. + */ + T const& getValue() const; + + /*! + * Retrieves the value of the matrix entry. + * + * @return The value of the matrix entry. + */ + T& getValue(); + + /*! + * Retrieves a pair of column and value that characterizes this entry. + * + * @return A column-value pair that characterizes this entry. + */ + std::pair const& getColumnValuePair() const; + + private: + // The actual matrix entry.
+ std::pair entry; + }; + + } +} +#endif + cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); -cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); -cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); -cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); +cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); +cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); +cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector& a, std::vector const& b); cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); diff --git a/src/models/PseudoModel.cpp b/src/models/PseudoModel.cpp index 37749a873..7f26e4f36 100644 --- a/src/models/PseudoModel.cpp +++ b/src/models/PseudoModel.cpp @@ -80,7 +80,7 @@ namespace storm { boost::container::flat_set allTargetBlocks; for (auto state : block) { for (auto const& transitionEntry : this->getRows(state)) { - uint_fast64_t targetBlock = stateToBlockMap[transitionEntry.first]; + uint_fast64_t targetBlock = stateToBlockMap[transitionEntry.getColumn()]; // We only need to consider transitions that are actually leaving the SCC. 
if (targetBlock != currentBlockIndex) { diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 84507912e..dce426ec2 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -94,7 +94,7 @@ namespace storm { std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); sccSubNondeterministicChoiceIndices.at(0) = 0; - // Preprocess all dependant states + // Preprocess all dependent states // Remove outgoing transitions and create the ChoiceIndices uint_fast64_t innerIndex = 0; uint_fast64_t outerIndex = 0; @@ -105,10 +105,10 @@ namespace storm { for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { typename storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { - if (!subMatrixIndices.get(rowIt->first)) { + if (!subMatrixIndices.get(rowIt->getColumn())) { // This is an outgoing transition of a state in the SCC to a state not included in the SCC // Subtracting Pr(tau) * x_other from b fixes that - sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->second * x.at(rowIt->first)); + sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); } } ++innerIndex; diff --git a/src/storage/SparseMatrix.cpp b/src/storage/SparseMatrix.cpp index cf19ef98e..c300e6858 100644 --- a/src/storage/SparseMatrix.cpp +++ b/src/storage/SparseMatrix.cpp @@ -931,7 +931,7 @@ namespace storm { * Returns a reference to the internal columnMapping vector */ template - std::vector> const& SparseMatrix::__internal_getColumnsAndValues() { + std::vector> const& SparseMatrix::__internal_getColumnsAndValues() { return this->columnsAndValues; } diff --git a/src/storage/SparseMatrix.h b/src/storage/SparseMatrix.h index 277ed069f..a06fbeb60 100644 --- a/src/storage/SparseMatrix.h +++ b/src/storage/SparseMatrix.h @@ -732,7 +732,7 @@ namespace storm { /*! * Returns a reference to the internal columnMapping vector */ - std::vector> const& __internal_getColumnsAndValues(); + std::vector> const& __internal_getColumnsAndValues(); private: /*! * Creates a submatrix of the current matrix by keeping only row groups and columns in the given row group diff --git a/src/utility/graph.h b/src/utility/graph.h index 32f311304..5fac4d29a 100644 --- a/src/utility/graph.h +++ b/src/utility/graph.h @@ -598,12 +598,12 @@ namespace storm { recursionStepBackward: for (; successorIterator != matrix.end(currentState); ++successorIterator) { - if (!visitedStates.get(successorIterator->first)) { + if (!visitedStates.get(successorIterator->getColumn())) { // Put unvisited successor on top of our recursion stack and remember that. - recursionStack.push_back(successorIterator->first); + recursionStack.push_back(successorIterator->getColumn()); // Also, put initial value for iterator on corresponding recursion stack. 
- iteratorRecursionStack.push_back(matrix.begin(successorIterator->first)); + iteratorRecursionStack.push_back(matrix.begin(successorIterator->getColumn())); goto recursionStepForward; } diff --git a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp index 811b9532d..e125b09a2 100644 --- a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp +++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -10,13 +10,8 @@ #include "storm-config.h" TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { - storm::settings::Settings* s = storm::settings::Settings::getInstance(); - storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); - //storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/scc/scc.tra", STORM_CPP_BASE_PATH "/examples/mdp/scc/scc.lab", ""); - - ASSERT_EQ(parser.getType(), storm::models::MDP); - - std::shared_ptr> mdp = parser.getModel>(); + storm::settings::Settings* s = storm::settings::Settings::getInstance(); + std::shared_ptr> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew")->as>(); ASSERT_EQ(mdp->getNumberOfStates(), 169ull); ASSERT_EQ(mdp->getNumberOfTransitions(), 436ull); @@ -108,12 +103,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); #endif delete rewardFormula; - - storm::parser::AutoParser stateRewardParser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", ""); - - ASSERT_EQ(stateRewardParser.getType(), storm::models::MDP); - - std::shared_ptr> stateRewardMdp = stateRewardParser.getModel>(); + std::shared_ptr> stateRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", "")->as>(); storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker stateRewardModelChecker(*stateRewardMdp); @@ -142,12 +132,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); #endif delete rewardFormula; - - storm::parser::AutoParser stateAndTransitionRewardParser(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew"); - - ASSERT_EQ(stateAndTransitionRewardParser.getType(), storm::models::MDP); - - std::shared_ptr> stateAndTransitionRewardMdp = stateAndTransitionRewardParser.getModel>(); + std::shared_ptr> stateAndTransitionRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH 
"/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew")->as>(); storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp); @@ -180,11 +165,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { storm::settings::Settings* s = storm::settings::Settings::getInstance(); - storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.trans.rew"); - - ASSERT_EQ(parser.getType(), storm::models::MDP); - - std::shared_ptr> mdp = parser.getModel>(); + std::shared_ptr> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.trans.rew")->as>(); ASSERT_EQ(mdp->getNumberOfStates(), 3172ull); ASSERT_EQ(mdp->getNumberOfTransitions(), 7144ull); diff --git a/test/functional/storage/StronglyConnectedComponentDecompositionTest.cpp b/test/functional/storage/StronglyConnectedComponentDecompositionTest.cpp index bc3585fc2..6f3d7876f 100644 --- a/test/functional/storage/StronglyConnectedComponentDecompositionTest.cpp +++ b/test/functional/storage/StronglyConnectedComponentDecompositionTest.cpp @@ -1,6 +1,7 @@ #include "gtest/gtest.h" #include "storm-config.h" #include "src/parser/AutoParser.h" +#include "src/models/Dtmc.h" #include "src/storage/StronglyConnectedComponentDecomposition.h" #include "src/models/MarkovAutomaton.h" @@ -49,8 +50,7 @@ TEST(StronglyConnectedComponentDecomposition, FullSystem2) { } TEST(StronglyConnectedComponentDecomposition, MatrixBasedSystem) { - storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/dtmc/scc/scc.tra", STORM_CPP_BASE_PATH "/examples/dtmc/scc/scc.lab", "", ""); - std::shared_ptr> dtmc = parser.getModel>(); + std::shared_ptr> dtmc = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/dtmc/scc/scc.tra", STORM_CPP_BASE_PATH "/examples/dtmc/scc/scc.lab", "", "")->as>(); storm::storage::StronglyConnectedComponentDecomposition sccDecomposition; ASSERT_NO_THROW(sccDecomposition = storm::storage::StronglyConnectedComponentDecomposition(*dtmc, true, false)); From 73ddba5b29485b03e48c91eb7a7f789ef3889e81 Mon Sep 17 00:00:00 2001 From: PBerger Date: Sun, 10 Aug 2014 17:48:14 +0200 Subject: [PATCH 34/43] Merged master, applied fixes. Added feedback from the cuda plugin and return of iteration count. 
Former-commit-id: 711ca3d9ec8140c09a3ef7ce617bf99dee4572d9 --- .../srcCuda/basicValueIteration.cu | 47 +++++++++++++++---- .../srcCuda/basicValueIteration.h | 4 +- ...onNondeterministicLinearEquationSolver.cpp | 29 +++++++++--- src/storage/SparseMatrix.cpp | 4 +- ...ValueIterationMdpPrctlModelCheckerTest.cpp | 12 +---- 5 files changed, 68 insertions(+), 28 deletions(-) diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index 33ee6218c..6e84d5e70 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -57,10 +57,11 @@ void exploadVector(std::vector> const& inputVect } template -void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { +bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { //std::vector matrixColumnIndices; //std::vector matrixValues; //exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); + bool errorOccured = false; IndexType* device_matrixRowIndices = nullptr; IndexType* device_matrixColIndicesAndValues = nullptr; @@ -85,12 +86,13 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaError_t cudaMallocResult; bool converged = false; - uint_fast64_t iterationCount = 0; + iterationCount = 0; CUDA_CHECK_ALL_ERRORS(); cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1)); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; goto cleanup; } @@ -98,6 +100,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; goto cleanup; } @@ -105,6 +108,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaMallocResult = cudaMalloc(reinterpret_cast(&device_x), sizeof(ValueType) * matrixColCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; goto cleanup; } @@ -112,6 +116,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaMallocResult = cudaMalloc(reinterpret_cast(&device_xSwap), sizeof(ValueType) * matrixColCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." 
<< std::endl; + errorOccured = true; goto cleanup; } @@ -119,6 +124,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaMallocResult = cudaMalloc(reinterpret_cast(&device_b), sizeof(ValueType) * matrixRowCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; goto cleanup; } @@ -126,6 +132,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaMallocResult = cudaMalloc(reinterpret_cast(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; goto cleanup; } @@ -133,6 +140,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaMallocResult = cudaMalloc(reinterpret_cast(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1)); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; goto cleanup; } @@ -147,6 +155,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; goto cleanup; } @@ -154,6 +163,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; goto cleanup; } @@ -161,6 +171,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; goto cleanup; } @@ -169,6 +180,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; goto cleanup; } @@ -176,6 +188,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; goto cleanup; } @@ -184,6 +197,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); if (cudaCopyResult != cudaSuccess) { std::cout << 
"Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; goto cleanup; } @@ -191,6 +205,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; goto cleanup; } @@ -232,6 +247,12 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy // Swap pointers, device_x always contains the most current result std::swap(device_x, device_xSwap); } + + if (!converged && (iterationCount == maxIterationCount)) { + iterationCount = 0; + errorOccured = true; + } + #ifdef DEBUG std::cout << "(DLL) Finished kernel execution." << std::endl; std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl; @@ -241,6 +262,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; goto cleanup; } @@ -254,6 +276,7 @@ cleanup: cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices); if (cudaFreeResult != cudaSuccess) { std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl; + errorOccured = true; } device_matrixRowIndices = nullptr; } @@ -261,6 +284,7 @@ cleanup: cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); if (cudaFreeResult != cudaSuccess) { std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; + errorOccured = true; } device_matrixColIndicesAndValues = nullptr; } @@ -268,6 +292,7 @@ cleanup: cudaError_t cudaFreeResult = cudaFree(device_x); if (cudaFreeResult != cudaSuccess) { std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; + errorOccured = true; } device_x = nullptr; } @@ -275,6 +300,7 @@ cleanup: cudaError_t cudaFreeResult = cudaFree(device_xSwap); if (cudaFreeResult != cudaSuccess) { std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl; + errorOccured = true; } device_xSwap = nullptr; } @@ -282,6 +308,7 @@ cleanup: cudaError_t cudaFreeResult = cudaFree(device_b); if (cudaFreeResult != cudaSuccess) { std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl; + errorOccured = true; } device_b = nullptr; } @@ -289,6 +316,7 @@ cleanup: cudaError_t cudaFreeResult = cudaFree(device_multiplyResult); if (cudaFreeResult != cudaSuccess) { std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl; + errorOccured = true; } device_multiplyResult = nullptr; } @@ -296,12 +324,15 @@ cleanup: cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices); if (cudaFreeResult != cudaSuccess) { std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." 
<< std::endl; + errorOccured = true; } device_nondeterministicChoiceIndices = nullptr; } #ifdef DEBUG std::cout << "(DLL) Finished cleanup." << std::endl; #endif + + return !errorOccured; } template @@ -705,19 +736,19 @@ void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector(x, y, maxElement); } -void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { +bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { if (relativePrecisionCheck) { - basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); } else { - basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); } } -void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { +bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { if (relativePrecisionCheck) { - basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); } else { - basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); } } diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index 1f1f88060..728ba07f9 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -83,8 +83,8 @@ template #endif cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); -cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const 
maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); -cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); +cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount); +cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount); cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector& a, std::vector const& b); cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index dce426ec2..ee2a59bc7 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -64,9 +64,9 @@ namespace storm { std::vector* currentX = nullptr; std::vector* swap = nullptr; - uint_fast64_t currentMaxLocalIterations = 0; - uint_fast64_t localIterations = 0; - uint_fast64_t globalIterations = 0; + size_t currentMaxLocalIterations = 0; + size_t localIterations = 0; + size_t globalIterations = 0; bool converged = true; // Iterate over all SCCs of the MDP as specified by the topological sort. 
This guarantees that an SCC is only @@ -128,13 +128,30 @@ namespace storm { LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); + bool result = false; + localIterations = 0; if (minimize) { - basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices); + result = basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); } else { - basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices); + result = basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); + + if (!result) { + converged = false; + LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; + } else { + converged = true; } - converged = true; + + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + #else LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. 
Internal Error!"; diff --git a/src/storage/SparseMatrix.cpp b/src/storage/SparseMatrix.cpp index 4ffcbe991..c300e6858 100644 --- a/src/storage/SparseMatrix.cpp +++ b/src/storage/SparseMatrix.cpp @@ -787,8 +787,8 @@ namespace storm { const_iterator ite; std::vector::const_iterator rowIterator = this->rowIndications.begin() + startRow; std::vector::const_iterator rowIteratorEnd = this->rowIndications.begin() + endRow; - typename std::vector::iterator resultIterator = result.begin() + startRow; - typename std::vector::iterator resultIteratorEnd = result.begin() + endRow; + std::vector::iterator resultIterator = result.begin() + startRow; + std::vector::iterator resultIteratorEnd = result.begin() + endRow; for (; resultIterator != resultIteratorEnd; ++rowIterator, ++resultIterator) { *resultIterator = storm::utility::constantZero(); diff --git a/test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp index 2f0e35393..441c4a3bb 100644 --- a/test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp +++ b/test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -8,11 +8,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { storm::settings::Settings* s = storm::settings::Settings::getInstance(); - storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.trans.rew"); - - ASSERT_EQ(parser.getType(), storm::models::MDP); - - std::shared_ptr> mdp = parser.getModel>(); + std::shared_ptr> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.trans.rew")->as>(); ASSERT_EQ(mdp->getNumberOfStates(), 2095783ull); ASSERT_EQ(mdp->getNumberOfTransitions(), 7714385ull); @@ -79,11 +75,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Consensus) { // Increase the maximal number of iterations, because the solver does not converge otherwise. // This is done in the main cpp unit - storm::parser::AutoParser parser(STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.tra", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.lab", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.steps.state.rew", ""); - - ASSERT_EQ(parser.getType(), storm::models::MDP); - - std::shared_ptr> mdp = parser.getModel>(); + std::shared_ptr> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.tra", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.lab", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.steps.state.rew", "")->as>(); ASSERT_EQ(mdp->getNumberOfStates(), 63616ull); ASSERT_EQ(mdp->getNumberOfTransitions(), 213472ull); From 7e77fbb6bbce600a004c7f622f90188cd711c1f4 Mon Sep 17 00:00:00 2001 From: PBerger Date: Sun, 10 Aug 2014 18:13:28 +0200 Subject: [PATCH 35/43] Some testing stuff. 
Former-commit-id: d7a9085af51531ac5ee7a31b4f9d29e20d01d41a --- ...alueIterationNondeterministicLinearEquationSolver.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index ee2a59bc7..e9d0222ea 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -45,6 +45,9 @@ namespace storm { template void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { + // For testing only + LOG4CPLUS_INFO(logger, ">>> Using GPU based model checker! <<<"); + // Now, we need to determine the SCCs of the MDP and perform a topological sort. std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); @@ -124,9 +127,9 @@ namespace storm { throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; } - LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); - LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); - LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); + //LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); + //LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); + //LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); bool result = false; localIterations = 0; From 422a31740738c6cd6011965d542b98d0a99d4fce Mon Sep 17 00:00:00 2001 From: PBerger Date: Tue, 12 Aug 2014 23:12:12 +0200 Subject: [PATCH 36/43] Made the OptimalSCC algorithm MUCH faster. Fixed error reporting in AtomicPropositionLabelingParser.cpp and SparseStateRewardParser.cpp. 
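A minimal sketch of the duplicate-state check both parser fixes below converge on (condensed: the real code logs via LOG4CPLUS_ERROR and throws storm::exceptions::WrongFormatException with the same message): lastState starts at the sentinel (uint_fast64_t)-1, so the first state always passes, and any later state index that fails to strictly increase is rejected with the filename in the message.

#include <cstdint>
#include <stdexcept>
#include <string>

void checkStateOrder(uint_fast64_t const state, uint_fast64_t& lastState, std::string const& filename) {
    // Sentinel meaning "no state read yet"; state index (2^64)-1 is a deliberately bad start.
    uint_fast64_t const startIndexComparison = static_cast<uint_fast64_t>(-1);
    if (state <= lastState && lastState != startIndexComparison) {
        throw std::runtime_error("Error while parsing " + filename + ": State " + std::to_string(state) + " was found but has already been read or skipped previously.");
    }
    lastState = state;
}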
Former-commit-id: 77ba352a295faff6ff23ef3f83cdf4c7ff285d54 --- .../AtomicPropositionLabelingParser.cpp | 23 ++++---- src/parser/SparseStateRewardParser.cpp | 17 +++--- ...onNondeterministicLinearEquationSolver.cpp | 56 +++++++++++++------ 3 files changed, 60 insertions(+), 36 deletions(-) diff --git a/src/parser/AtomicPropositionLabelingParser.cpp b/src/parser/AtomicPropositionLabelingParser.cpp index 0dac9a8c9..dffdd030a 100644 --- a/src/parser/AtomicPropositionLabelingParser.cpp +++ b/src/parser/AtomicPropositionLabelingParser.cpp @@ -29,8 +29,8 @@ namespace storm { // Open the given file. if (!MappedFile::fileExistsAndIsReadable(filename.c_str())) { - LOG4CPLUS_ERROR(logger, "Error while parsing " << filename << ": File does not exist or is not readable."); - throw storm::exceptions::FileIoException() << "The supplied Labeling input file \"" << filename << "\" does not exist or is not readable by this process."; + LOG4CPLUS_ERROR(logger, "Error while parsing " << filename << ": The supplied Labeling input file does not exist or is not readable by this process."); + throw storm::exceptions::FileIoException() << "Error while parsing " << filename << ": The supplied Labeling input file does not exist or is not readable by this process."; } MappedFile file(filename.c_str()); @@ -68,10 +68,10 @@ namespace storm { // If #DECLARATION or #END have not been found, the file format is wrong. if (!(foundDecl && foundEnd)) { - LOG4CPLUS_ERROR(logger, "Wrong file format in (" << filename << "). File header is corrupted."); + LOG4CPLUS_ERROR(logger, "Error while parsing " << filename << ": File header is corrupted (#DECLARATION or #END missing - case sensitive)."); if (!foundDecl) LOG4CPLUS_ERROR(logger, "\tDid not find #DECLARATION token."); if (!foundEnd) LOG4CPLUS_ERROR(logger, "\tDid not find #END token."); - throw storm::exceptions::WrongFormatException(); + throw storm::exceptions::WrongFormatException() << "Error while parsing " << filename << ": File header is corrupted (#DECLARATION or #END missing - case sensitive)."; } @@ -100,8 +100,8 @@ namespace storm { if (cnt >= sizeof(proposition)) { // if token is longer than our buffer, the following strncpy code might get risky... - LOG4CPLUS_ERROR(logger, "Wrong file format in (" << filename << "). Atomic proposition with length > " << (sizeof(proposition)-1) << " was found."); - throw storm::exceptions::WrongFormatException(); + LOG4CPLUS_ERROR(logger, "Error while parsing " << filename << ": Atomic proposition with length > " << (sizeof(proposition) - 1) << " was found."); + throw storm::exceptions::WrongFormatException() << "Error while parsing " << filename << ": Atomic proposition with length > " << (sizeof(proposition) - 1) << " was found."; } else if (cnt > 0) { @@ -127,6 +127,7 @@ namespace storm { uint_fast64_t state = 0; uint_fast64_t lastState = (uint_fast64_t)-1; + uint_fast64_t const startIndexComparison = lastState; cnt = 0; // Now parse the assignments of labels to nodes. @@ -137,9 +138,9 @@ namespace storm { state = checked_strtol(buf, &buf); // If the state has already been read or skipped once there might be a problem with the file (doubled lines, or blocks). - if(state <= lastState && lastState != (uint_fast64_t)-1) { - LOG4CPLUS_ERROR(logger, "Wrong file format in (" << filename << "). 
State " << state << " was found but has already been read or skipped previously."); - throw storm::exceptions::WrongFormatException() << "State " << state << " was found but has already been read or skipped previously."; + if (state <= lastState && lastState != startIndexComparison) { + LOG4CPLUS_ERROR(logger, "Error while parsing " << filename << ": State " << state << " was found but has already been read or skipped previously."); + throw storm::exceptions::WrongFormatException() << "Error while parsing " << filename << ": State " << state << " was found but has already been read or skipped previously."; } while ((buf[0] != '\r') && (buf[0] != '\n') && (buf[0] != '\0')) { @@ -159,8 +160,8 @@ namespace storm { // Has the label been declared in the header? if(!labeling.containsAtomicProposition(proposition)) { - LOG4CPLUS_ERROR(logger, "Wrong file format in (" << filename << "). Atomic proposition" << proposition << " was found but not declared."); - throw storm::exceptions::WrongFormatException(); + LOG4CPLUS_ERROR(logger, "Error while parsing " << filename << ": Atomic proposition " << proposition << " was found but not declared."); + throw storm::exceptions::WrongFormatException() << "Error while parsing " << filename << ": Atomic proposition " << proposition << " was found but not declared."; } labeling.addAtomicPropositionToState(proposition, state); buf += cnt; diff --git a/src/parser/SparseStateRewardParser.cpp b/src/parser/SparseStateRewardParser.cpp index 679bb5e7e..e707dbe3b 100644 --- a/src/parser/SparseStateRewardParser.cpp +++ b/src/parser/SparseStateRewardParser.cpp @@ -4,7 +4,7 @@ * Created on: 23.12.2012 * Author: Christian Dehnert */ - +#include #include "src/parser/SparseStateRewardParser.h" #include "src/exceptions/WrongFormatException.h" @@ -37,6 +37,7 @@ namespace storm { // Now parse state reward assignments. uint_fast64_t state = 0; uint_fast64_t lastState = (uint_fast64_t)-1; + uint_fast64_t const startIndexComparison = lastState; double reward; // Iterate over states. @@ -47,21 +48,21 @@ namespace storm { // If the state has already been read or skipped once there might be a problem with the file (doubled lines, or blocks). // Note: The value -1 shows that lastState has not yet been set, i.e. this is the first run of the loop (state index (2^64)-1 is a really bad starting index). - if(state <= lastState && lastState != (uint_fast64_t)-1) { - LOG4CPLUS_ERROR(logger, "State " << state << " was found but has already been read or skipped previously."); - throw storm::exceptions::WrongFormatException() << "State " << state << " was found but has already been read or skipped previously."; + if (state <= lastState && lastState != startIndexComparison) { + LOG4CPLUS_ERROR(logger, "Error while parsing " << filename << ": State " << state << " was found but has already been read or skipped previously."); + throw storm::exceptions::WrongFormatException() << "Error while parsing " << filename << ": State " << state << " was found but has already been read or skipped previously."; } if(stateCount <= state) { - LOG4CPLUS_ERROR(logger, "Found reward for a state of an invalid index \"" << state << "\". The model has only " << stateCount << " states."); - throw storm::exceptions::OutOfRangeException() << "Found reward for a state of an invalid index \"" << state << "\""; + LOG4CPLUS_ERROR(logger, "Error while parsing " << filename << ": Found reward for a state of an invalid index \"" << state << "\". 
The model has only " << stateCount << " states."); + throw storm::exceptions::OutOfRangeException() << "Error while parsing " << filename << ": Found reward for a state of an invalid index \"" << state << "\""; } reward = checked_strtod(buf, &buf); if (reward < 0.0) { - LOG4CPLUS_ERROR(logger, "Expected positive reward value but got \"" << reward << "\"."); - throw storm::exceptions::WrongFormatException() << "State reward file specifies illegal reward value."; + LOG4CPLUS_ERROR(logger, "Error while parsing " << filename << ": Expected positive reward value but got \"" << reward << "\"."); + throw storm::exceptions::WrongFormatException() << "Error while parsing " << filename << ": State reward file specifies illegal reward value."; } stateRewards[state] = reward; diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index e9d0222ea..4955797ed 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -80,7 +80,7 @@ namespace storm { bool const useGpu = sccIndexIt->first; storm::storage::StateBlock const& scc = sccIndexIt->second; - // Generate a submatrix + // Generate a sub matrix storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); std::vector sccSubB(sccSubmatrix.getRowCount()); @@ -97,7 +97,7 @@ namespace storm { std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); sccSubNondeterministicChoiceIndices.at(0) = 0; - // Preprocess all dependent states + // Pre-process all dependent states // Remove outgoing transitions and create the ChoiceIndices uint_fast64_t innerIndex = 0; uint_fast64_t outerIndex = 0; @@ -135,8 +135,7 @@ namespace storm { localIterations = 0; if (minimize) { result = basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } - else { + } else { result = basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); } LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. 
" << maximalNumberOfIterations << " Iterations on GPU."); @@ -276,13 +275,24 @@ namespace storm { storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); - for (size_t j = startIndex + 1; j < i; ++j) { - storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; - std::vector::iterator const middleIterator = tempGroups.end(); - tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); - std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); + if (((startIndex + 1) + 80) >= i) { + size_t lastSize = 0; + for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + lastSize = tempGroups.size(); + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + std::vector::iterator middleIterator = tempGroups.begin(); + std::advance(middleIterator, lastSize); + std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); + } + } else { + // Use std::sort + for (size_t j = startIndex + 1; j < i; ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + } + std::sort(tempGroups.begin(), tempGroups.end()); } - result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); } else { // Only one group, copy construct. @@ -318,13 +328,25 @@ namespace storm { storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); - for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { - storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; - std::vector::iterator const middleIterator = tempGroups.end(); - tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); - std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); - } - + // For set counts <= 80, Inplace Merge is faster + if (((startIndex + 1) + 80) >= topologicalSortSize) { + size_t lastSize = 0; + for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + lastSize = tempGroups.size(); + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + std::vector::iterator middleIterator = tempGroups.begin(); + std::advance(middleIterator, lastSize); + std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); + } + } else { + // Use std::sort + for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + } + std::sort(tempGroups.begin(), tempGroups.end()); + } result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); } else { From ea427fcde1fdf7a031446be2e2d4187dcbe0be6a Mon Sep 17 00:00:00 2001 From: PBerger Date: Wed, 20 Aug 2014 23:46:14 +0200 Subject: [PATCH 37/43] Fixed include directories for CUDA Plugin in CMakeLists.txt Refactored all code related to the SPMV kernels to work with float. Wrote a test that determines whether the compiler uses 64bit boundary alignments on std::pairs of uint64 and float. 
Introduced functions that allow for conversions between different ValueTypes (e.g. from float to double and backwards). Former-commit-id: 830d24064fbe9166d80b8273c01f2a7f7ea09242 --- CMakeLists.txt | 2 +- .../cudaForStorm/CMakeFloatAlignmentCheck.cpp | 31 ++ resources/cudaForStorm/CMakeLists.txt | 18 + .../srcCuda/basicValueIteration.cu | 167 ++++++-- .../srcCuda/basicValueIteration.h | 12 + .../cudaForStorm/srcCuda/cuspExtension.h | 353 ++--------------- .../srcCuda/cuspExtensionDouble.h | 361 +++++++++++++++++ .../cudaForStorm/srcCuda/cuspExtensionFloat.h | 375 ++++++++++++++++++ .../cudaForStorm/storm-cudaplugin-config.h.in | 4 +- src/models/PseudoModel.cpp | 3 + ...veNondeterministicLinearEquationSolver.cpp | 3 +- ...onNondeterministicLinearEquationSolver.cpp | 198 +++++++++ src/storage/SparseMatrix.cpp | 4 + src/storage/SparseMatrix.h | 17 + ...tronglyConnectedComponentDecomposition.cpp | 1 + src/utility/vector.h | 14 + test/functional/solver/CudaPluginTest.cpp | 193 ++++++++- 17 files changed, 1388 insertions(+), 368 deletions(-) create mode 100644 resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp create mode 100644 resources/cudaForStorm/srcCuda/cuspExtensionDouble.h create mode 100644 resources/cudaForStorm/srcCuda/cuspExtensionFloat.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 072e50b68..cc2f18d67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -320,7 +320,7 @@ if (ENABLE_Z3) link_directories("${Z3_ROOT}/bin") endif() if (STORM_USE_CUDAFORSTORM) - link_directories("${PROJECT_SOURCE_DIR}/build/cudaForStorm/lib") + link_directories("${PROJECT_BINARY_DIR}/cudaForStorm/lib") endif() if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL "")) set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib") diff --git a/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp b/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp new file mode 100644 index 000000000..7b3b7a8b1 --- /dev/null +++ b/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp @@ -0,0 +1,31 @@ +/* + * This is a component of StoRM - Cuda Plugin to check whether a pair of uint_fast64_t and float gets auto-aligned to match 64bit boundaries + */ + #include <cstdint> + #include <utility> + #include <vector> + + #define CONTAINER_SIZE 100ul + +int main(int argc, char* argv[]) { + int result = 0; + + std::vector<std::pair<uint_fast64_t, float>> myVector; + for (size_t i = 0; i < CONTAINER_SIZE; ++i) { + myVector.push_back(std::make_pair(i, 42.12345f * i)); + } + + char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first)); + char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first)); + ptrdiff_t uintDiff = secondUintPointer - firstUintPointer; + + if (uintDiff == (2 * sizeof(uint_fast64_t))) { + result = 2; + } else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) { + result = 3; + } else { + result = -5; + } + + return result; + } \ No newline at end of file diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt index 7bc37a097..d7d525386 100644 --- a/resources/cudaForStorm/CMakeLists.txt +++ b/resources/cudaForStorm/CMakeLists.txt @@ -131,6 +131,24 @@ else() message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})") endif() +# Test for Float 64bit Alignment +try_run(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT + ${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp" + COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR +) +if(NOT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT) + message(FATAL_ERROR "StoRM 
(CudaPlugin) - Could not test float type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}") +elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 2) + message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment active.") + set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "define") +elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 3) + message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment disabled.") + set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "undef") +else() + message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Float Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_FLOATALIGNMENT})") +endif() + + # # Make a version file containing the current version from git. # diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index 6e84d5e70..6aa4a2fb4 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -10,20 +10,15 @@ #include "utility.h" #include "cuspExtension.h" + #include #include #include +#include "storm-cudaplugin-config.h" + #ifdef DEBUG -#define CUDA_CHECK_ALL_ERRORS() do { \ - cudaError_t errSync = cudaGetLastError(); \ - cudaError_t errAsync = cudaDeviceSynchronize(); \ - if (errSync != cudaSuccess) { \ - std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ")" << std::endl; \ - } \ - if (errAsync != cudaSuccess) { \ - std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \ - } } while(false) +#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false) #else #define CUDA_CHECK_ALL_ERRORS() do {} while (false) #endif @@ -56,15 +51,16 @@ void exploadVector(std::vector> const& inputVect } } +// TEMPLATE VERSION template -bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { +bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { //std::vector matrixColumnIndices; //std::vector matrixValues; //exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); bool errorOccured = false; IndexType* device_matrixRowIndices = nullptr; - IndexType* device_matrixColIndicesAndValues = nullptr; + ValueType* device_matrixColIndicesAndValues = nullptr; ValueType* device_x = nullptr; ValueType* device_xSwap = nullptr; ValueType* device_b = nullptr; @@ -74,7 +70,7 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy #ifdef DEBUG std::cout.sync_with_stdio(true); std::cout << 
"(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl; - std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory()))*100 << "%)." << std::endl; + std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)." << std::endl; size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size(); std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; #endif @@ -96,12 +92,27 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy goto cleanup; } - CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); - if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; - errorOccured = true; - goto cleanup; +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT +#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true +#else +#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false +#endif + if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) { + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; + goto cleanup; + } + } else { + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." 
<< std::endl; + errorOccured = true; + goto cleanup; + } } CUDA_CHECK_ALL_ERRORS(); @@ -159,12 +170,23 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy goto cleanup; } - CUDA_CHECK_ALL_ERRORS(); - cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); - if (cudaCopyResult != cudaSuccess) { - std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; - errorOccured = true; - goto cleanup; + // Copy all data as floats are expanded to 64bits :/ + if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) { + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } + } else { + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } } CUDA_CHECK_ALL_ERRORS(); @@ -214,9 +236,8 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy #endif // Data is on device, start Kernel - while (!converged && iterationCount < maxIterationCount) - { // In a sub-area since transfer of control via label evades initialization - cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); thrust::device_ptr devicePtrThrust_b(device_b); @@ -227,7 +248,7 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy CUDA_CHECK_ALL_ERRORS(); // Reduce: Reduce multiplyResult to a new x vector - cusp::detail::device::storm_cuda_opt_vector_reduce(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult); + cusp::detail::device::storm_cuda_opt_vector_reduce(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); // Check for convergence @@ -338,7 +359,7 @@ cleanup: template void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { IndexType* device_matrixRowIndices = nullptr; - IndexType* device_matrixColIndicesAndValues = nullptr; + ValueType* device_matrixColIndicesAndValues = nullptr; ValueType* device_x = nullptr; ValueType* device_multiplyResult = nullptr; @@ -362,12 +383,21 @@ void 
basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } +#else CUDA_CHECK_ALL_ERRORS(); cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } +#endif CUDA_CHECK_ALL_ERRORS(); cudaMallocResult = cudaMalloc(reinterpret_cast(&device_x), sizeof(ValueType) * matrixColCount); @@ -397,12 +427,21 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); #ifdef DEBUG @@ -601,7 +640,7 @@ void basicValueIteration_reduceGroupedVector(std::vector const& group do { // Reduce: Reduce multiplyResult to a new x vector - cusp::detail::device::storm_cuda_opt_vector_reduce(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); + cusp::detail::device::storm_cuda_opt_vector_reduce(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); CUDA_CHECK_ALL_ERRORS(); } while (false); @@ -713,7 +752,7 @@ cleanup: */ void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { - basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); + basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); } void basicValueIteration_addVectorsInplace_double(std::vector& a, std::vector const& b) { @@ -736,6 +775,31 @@ void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector(x, y, maxElement); } +// Float +void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { + basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); +} + +void basicValueIteration_addVectorsInplace_float(std::vector& a, std::vector const& b) { + basicValueIteration_addVectorsInplace(a, b); +} + +void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { + basicValueIteration_reduceGroupedVector(groupedVector, grouping, targetVector); +} + +void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { + basicValueIteration_reduceGroupedVector(groupedVector, grouping, targetVector); +} + +void basicValueIteration_equalModuloPrecision_float_Relative(std::vector const& x, std::vector const& y, float& maxElement) { + basicValueIteration_equalModuloPrecision(x, y, maxElement); 
+} + +void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector const& x, std::vector const& y, float& maxElement) { + basicValueIteration_equalModuloPrecision(x, y, maxElement); +} + bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { if (relativePrecisionCheck) { return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); @@ -752,6 +816,22 @@ bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const max } } +bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + if (relativePrecisionCheck) { + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); + } else { + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); + } +} + +bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + if (relativePrecisionCheck) { + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); + } else { + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); + } +} + size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) { size_t const valueTypeSize = sizeof(double); size_t const indexTypeSize = sizeof(uint_fast64_t); @@ -772,5 +852,28 @@ size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t con // Vectors x, xSwap, b, multiplyResult size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize); + return (matrixDataSize + vectorSizes); +} + +size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) { + size_t const valueTypeSize = sizeof(float); + size_t const indexTypeSize = sizeof(uint_fast64_t); + + /* + IndexType* device_matrixRowIndices = nullptr; + IndexType* device_matrixColIndices = nullptr; + ValueType* device_matrixValues = nullptr; + ValueType* device_x = nullptr; + ValueType* device_xSwap = nullptr; + ValueType* device_b = nullptr; + ValueType* device_multiplyResult = nullptr; + IndexType* device_nondeterministicChoiceIndices = nullptr; + */ + + // Row Indices, Column Indices, Values, Choice 
Indices + size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize); + // Vectors x, xSwap, b, multiplyResult + size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize); + return (matrixDataSize + vectorSizes); } \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index 728ba07f9..09b4be5ca 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -85,6 +85,11 @@ template cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount); cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount); + +cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); +cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount); +cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount); + cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector& a, std::vector const& b); cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); @@ -92,4 +97,11 @@ cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_m cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector const& x, std::vector const& y, double& maxElement); cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector const& x, std::vector const& y, double& maxElement); +cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, 
std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); +cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_float(std::vector& a, std::vector const& b); +cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); +cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); +cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_Relative(std::vector const& x, std::vector const& y, float& maxElement); +cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector const& x, std::vector const& y, float& maxElement); + #endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h index 7c9a9fb28..11c673bf9 100644 --- a/resources/cudaForStorm/srcCuda/cuspExtension.h +++ b/resources/cudaForStorm/srcCuda/cuspExtension.h @@ -1,338 +1,47 @@ -/* - * This is an extension of the original CUSP csr_vector.h SPMV implementation. - * It is based on the Code and incorporates changes as to cope with the details - * of the StoRM code. - * As this is mostly copy & paste, the original license still applies. - */ - -/* - * Copyright 2008-2009 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - #pragma once -#include -#include -#include - -namespace cusp -{ -namespace detail -{ -namespace device -{ - -////////////////////////////////////////////////////////////////////////////// -// CSR SpMV kernels based on a vector model (one warp per row) -////////////////////////////////////////////////////////////////////////////// -// -// spmv_csr_vector_device -// Each row of the CSR matrix is assigned to a warp. The warp computes -// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with -// the x vector, in parallel. This division of work implies that -// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous -// manner (but generally not aligned). On GT200 these accesses are -// coalesced, unlike kernels based on the one-row-per-thread division of -// work. Since an entire 32-thread warp is assigned to each row, many -// threads will remain idle when their row contains a small number -// of elements. This code relies on implicit synchronization among -// threads in a warp. -// -// spmv_csr_vector_tex_device -// Same as spmv_csr_vector_tex_device, except that the texture cache is -// used for accessing the x vector. 
-// -// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] - - -template -__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) -__global__ void -storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y) -{ - __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals - __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2]; - - const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; - - const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index - const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector - const IndexType vector_id = thread_id / THREADS_PER_VECTOR; // global vector index - const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block - const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors - - for(IndexType row = vector_id; row < num_rows; row += num_vectors) - { - // use two threads to fetch Ap[row] and Ap[row+1] - // this is considerably faster than the straightforward version - if(thread_lane < 2) - ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; - - const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; - const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; - - // initialize local sum - ValueType sum = 0; - - if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) - { - // ensure aligned memory access to Aj and Ax - - IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; - - // accumulate local sums - if(jj >= row_start && jj < row_end) - sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); - - // accumulate local sums - for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) - sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); - } - else - { - // accumulate local sums - for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) - sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); - } - - // store local sum in shared memory - sdata[threadIdx.x] = sum; - - // reduce local sums to row sum - if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; - if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; - if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; - if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; - if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; - - // first thread writes the result - if (thread_lane == 0) - y[row] = sdata[threadIdx.x]; - } -} - -template -__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) -__global__ void -storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y, const ValueType minMaxInitializer) -{ - __shared__ volatile ValueType sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals - __shared__ volatile IndexType 
ptrs[ROWS_PER_BLOCK][2]; - - const IndexType THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; - - const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index - const IndexType thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector - const IndexType vector_id = thread_id / THREADS_PER_ROW; // global vector index - const IndexType vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block - const IndexType num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors - - for(IndexType row = vector_id; row < num_rows; row += num_vectors) - { - // use two threads to fetch Ap[row] and Ap[row+1] - // this is considerably faster than the straightforward version - if(thread_lane < 2) - ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; - - const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; - const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; - - // initialize local Min/Max - ValueType localMinMaxElement = minMaxInitializer; - - if (THREADS_PER_ROW == 32 && row_end - row_start > 32) - { - // ensure aligned memory access to Aj and Ax - - IndexType jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; - - // accumulate local sums - if(jj >= row_start && jj < row_end) { - if(Minimize) { - localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; - } - } +#include "cuspExtensionFloat.h" +#include "cuspExtensionDouble.h" - // accumulate local sums - for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) - if(Minimize) { - localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; - } - } - else - { - // accumulate local sums - for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) - if(Minimize) { - localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; - } - } +namespace cusp { +namespace detail { +namespace device { - // store local sum in shared memory - sdata[threadIdx.x] = localMinMaxElement; - - // reduce local min/max to row min/max - if (Minimize) { - if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement); - } else { - if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? 
sdata[threadIdx.x + 16] : localMinMaxElement); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement); - } - - // first thread writes the result - if (thread_lane == 0) - x[row] = sdata[threadIdx.x]; - } +template +void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) { + // + throw; } - -template -void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) -{ - ValueType __minMaxInitializer = -std::numeric_limits::max(); - if (Minimize) { - __minMaxInitializer = std::numeric_limits::max(); - } - const ValueType minMaxInitializer = __minMaxInitializer; - - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - storm_cuda_opt_vector_reduce_kernel <<>> - (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); -} - -template -void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_entries, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) -{ - const IndexType rows_per_group = num_entries / num_rows; - - if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } - - __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); +template <> +void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) { + storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y); } - -template -void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) -{ - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, 
DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - if (UseCache) - bind_x(x); - - storm_cuda_opt_spmv_csr_vector_kernel <<>> - (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); - - if (UseCache) - unbind_x(x); +template <> +void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) { + storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y); } -template -void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) -{ - const IndexType nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - - __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +template +void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) { + // + throw; } - -template -void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) -{ - const IndexType nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - - __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +template <> +void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) { + storm_cuda_opt_vector_reduce_double(num_rows, num_entries, nondeterministicChoiceIndices, x, y); } - -// NON-OPT - -template -void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) -{ - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - if (UseCache) - bind_x(x); - - spmv_csr_vector_kernel 
<<>> - (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); - - if (UseCache) - unbind_x(x); +template <> +void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) { + storm_cuda_opt_vector_reduce_double(num_rows, num_entries, nondeterministicChoiceIndices, x, y); } -template -void storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) -{ - const IndexType nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - - __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +template <> +void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) { + storm_cuda_opt_vector_reduce_float(num_rows, num_entries, nondeterministicChoiceIndices, x, y); } - -template -void storm_cuda_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) -{ - const IndexType nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - - __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +template <> +void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) { + storm_cuda_opt_vector_reduce_float(num_rows, num_entries, nondeterministicChoiceIndices, x, y); } } // end namespace device diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h new file mode 100644 index 000000000..eee59b007 --- /dev/null +++ b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h @@ -0,0 +1,361 @@ +/* + * This is an extension of the original CUSP csr_vector.h SPMV implementation. + * It is based on the Code and incorporates changes as to cope with the details + * of the StoRM code. + * As this is mostly copy & paste, the original license still applies. 
+ */ + +/* + * Copyright 2008-2009 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cusp +{ +namespace detail +{ +namespace device +{ + +////////////////////////////////////////////////////////////////////////////// +// CSR SpMV kernels based on a vector model (one warp per row) +////////////////////////////////////////////////////////////////////////////// +// +// spmv_csr_vector_device +// Each row of the CSR matrix is assigned to a warp. The warp computes +// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with +// the x vector, in parallel. This division of work implies that +// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous +// manner (but generally not aligned). On GT200 these accesses are +// coalesced, unlike kernels based on the one-row-per-thread division of +// work. Since an entire 32-thread warp is assigned to each row, many +// threads will remain idle when their row contains a small number +// of elements. This code relies on implicit synchronization among +// threads in a warp. +// +// spmv_csr_vector_tex_device +// Same as spmv_csr_vector_device, except that the texture cache is +// used for accessing the x vector. 
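+//
+// Storage note for the storm variant (inferred from the kernels below, not part of
+// the original CUSP comment): matrixColumnIndicesAndValues interleaves each entry's
+// 64-bit column index with its double value, so entry jj stores its column at offset
+// 2*jj (reinterpreted as an integer) and its value at offset 2*jj + 1, roughly:
+//
+//   column(jj) = *reinterpret_cast<const uint_fast64_t*>(matrixColumnIndicesAndValues + 2 * jj);
+//   value(jj)  = matrixColumnIndicesAndValues[2 * jj + 1];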
+// +// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] + + +template +__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) +__global__ void +storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double * x, double * y) +{ + __shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block + const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local sum + double sum = 0; + + if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + } + + // accumulate local sums + for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); + } + } else { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = sum; + + // reduce local sums to row sum + if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; + if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; + if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; + if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; + if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; + + // first thread writes the result + if (thread_lane == 0) + y[row] = sdata[threadIdx.x]; + } +} + +template 
+__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) +__global__ void +storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y, const double minMaxInitializer) +{ + __shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block + const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local Min/Max + double localMinMaxElement = minMaxInitializer; + + if (THREADS_PER_ROW == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // accumulate local sums + for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + else + { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = localMinMaxElement; + + // reduce local min/max to row min/max + if (Minimize) { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? 
sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ + + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); + } else { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? 
sdata[threadIdx.x + 1] : localMinMaxElement);*/ + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); + } + + // first thread writes the result + if (thread_lane == 0) + x[row] = sdata[threadIdx.x]; + } +} + +template +void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) +{ + double __minMaxInitializer = -std::numeric_limits::max(); + if (Minimize) { + __minMaxInitializer = std::numeric_limits::max(); + } + const double minMaxInitializer = __minMaxInitializer; + + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + storm_cuda_opt_vector_reduce_kernel_double <<>> + (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); +} + +template +void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) +{ + const uint_fast64_t rows_per_group = num_entries / num_rows; + + if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + + __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); +} + +template +void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + storm_cuda_opt_spmv_csr_vector_kernel_double <<>> + (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { 
__storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +// NON-OPT + +template +void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + spmv_csr_vector_kernel <<>> + (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const 
double * matrixValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +} // end namespace device +} // end namespace detail +} // end namespace cusp \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h b/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h new file mode 100644 index 000000000..bb9acf78e --- /dev/null +++ b/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h @@ -0,0 +1,375 @@ +/* + * This is an extension of the original CUSP csr_vector.h SPMV implementation. + * It is based on the Code and incorporates changes as to cope with the details + * of the StoRM code. + * As this is mostly copy & paste, the original license still applies. + */ + +/* + * Copyright 2008-2009 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include +#include + +#include + +#include + +#include "storm-cudaplugin-config.h" + +namespace cusp +{ +namespace detail +{ +namespace device +{ + +////////////////////////////////////////////////////////////////////////////// +// CSR SpMV kernels based on a vector model (one warp per row) +////////////////////////////////////////////////////////////////////////////// +// +// spmv_csr_vector_device +// Each row of the CSR matrix is assigned to a warp. The warp computes +// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with +// the x vector, in parallel. This division of work implies that +// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous +// manner (but generally not aligned). On GT200 these accesses are +// coalesced, unlike kernels based on the one-row-per-thread division of +// work. Since an entire 32-thread warp is assigned to each row, many +// threads will remain idle when their row contains a small number +// of elements. This code relies on implicit synchronization among +// threads in a warp. +// +// spmv_csr_vector_tex_device +// Same as spmv_csr_vector_device, except that the texture cache is +// used for accessing the x vector. 
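+//
+// Storage note for the storm variant (inferred from the kernels below, not part of
+// the original CUSP comment): each entry packs a 64-bit column index together with a
+// float value. Depending on STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT, the stride is
+// 4 floats (value padded to an 8-byte boundary) or 3 floats per entry, with the value
+// at offset stride*jj + 2 and the column index occupying the first two floats:
+//
+//   column(jj) = *reinterpret_cast<const uint_fast64_t*>(matrixColumnIndicesAndValues + stride * jj);
+//   value(jj)  = matrixColumnIndicesAndValues[stride * jj + 2];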
+// +// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] + + +template +__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) +__global__ void +storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y) +{ + __shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block + const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local sum + float sum = 0; + + if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); +#else + sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); +#endif + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + } + + // accumulate local sums + for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); +#else + sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); +#endif + } + } else { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); +#else + sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); +#endif + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = sum; + + // reduce local sums 
to row sum + if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; + if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; + if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; + if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; + if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; + + // first thread writes the result + if (thread_lane == 0) + y[row] = sdata[threadIdx.x]; + } +} + +template +__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) +__global__ void +storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer) +{ + __shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block + const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local Min/Max + float localMinMaxElement = minMaxInitializer; + + if (THREADS_PER_ROW == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // accumulate local sums + for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + else + { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? 
y[jj] : localMinMaxElement; + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = localMinMaxElement; + + // reduce local min/max to row min/max + if (Minimize) { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ + + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); + } else { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? 
sdata[threadIdx.x + 1] : localMinMaxElement);*/ + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); + } + + // first thread writes the result + if (thread_lane == 0) + x[row] = sdata[threadIdx.x]; + } +} + +template +void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) +{ + float __minMaxInitializer = -std::numeric_limits::max(); + if (Minimize) { + __minMaxInitializer = std::numeric_limits::max(); + } + const float minMaxInitializer = __minMaxInitializer; + + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + storm_cuda_opt_vector_reduce_kernel_float <<>> + (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); +} + +template +void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) +{ + const uint_fast64_t rows_per_group = num_entries / num_rows; + + if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + + __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); +} + +template +void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + storm_cuda_opt_spmv_csr_vector_kernel_float <<>> + (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, 
matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +// NON-OPT + +template +void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + spmv_csr_vector_kernel <<>> + (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) +{ + const uint_fast64_t 
nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +} // end namespace device +} // end namespace detail +} // end namespace cusp \ No newline at end of file diff --git a/resources/cudaForStorm/storm-cudaplugin-config.h.in b/resources/cudaForStorm/storm-cudaplugin-config.h.in index 1cfc9119e..3703d0c81 100644 --- a/resources/cudaForStorm/storm-cudaplugin-config.h.in +++ b/resources/cudaForStorm/storm-cudaplugin-config.h.in @@ -2,7 +2,6 @@ * StoRM - Built-in Options * * This file is parsed by CMake during makefile generation - * It contains information such as the base path to the test/example data */ #ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ @@ -16,4 +15,7 @@ #define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" // The short hash of the git commit this build is based on #define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ // 0 iff no files were modified in the checkout, 1 otherwise +// Whether the size of float in a pair is expanded to 64bit +#@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + #endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ diff --git a/src/models/PseudoModel.cpp b/src/models/PseudoModel.cpp index 7f26e4f36..af446778c 100644 --- a/src/models/PseudoModel.cpp +++ b/src/models/PseudoModel.cpp @@ -101,6 +101,9 @@ namespace storm { template class ModelBasedPseudoModel; template class NonDeterministicMatrixBasedPseudoModel; template class DeterministicMatrixBasedPseudoModel; + template class ModelBasedPseudoModel ; + template class NonDeterministicMatrixBasedPseudoModel ; + template class DeterministicMatrixBasedPseudoModel ; template class ModelBasedPseudoModel; template class NonDeterministicMatrixBasedPseudoModel; template class DeterministicMatrixBasedPseudoModel; diff --git a/src/solver/NativeNondeterministicLinearEquationSolver.cpp b/src/solver/NativeNondeterministicLinearEquationSolver.cpp index 7f88258f1..3fd213ad9 100644 --- a/src/solver/NativeNondeterministicLinearEquationSolver.cpp +++ b/src/solver/NativeNondeterministicLinearEquationSolver.cpp @@ -76,7 +76,7 @@ namespace storm { } // Determine whether the method converged. - converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative); // Update environment variables. std::swap(currentX, newX); @@ -140,5 +140,6 @@ namespace storm { // Explicitly instantiate the solver. 
template class NativeNondeterministicLinearEquationSolver; + template class NativeNondeterministicLinearEquationSolver; } // namespace solver } // namespace storm diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 4955797ed..8a09d9e7d 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -42,9 +42,206 @@ namespace storm { NondeterministicLinearEquationSolver* TopologicalValueIterationNondeterministicLinearEquationSolver::clone() const { return new TopologicalValueIterationNondeterministicLinearEquationSolver(*this); } + + template<> + void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { + // For testing only + LOG4CPLUS_INFO(logger, ">>> Using GPU based model checker WITH FLOAT! <<<"); + + // Now, we need to determine the SCCs of the MDP and perform a topological sort. + std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); + storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); + storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); + + if (sccDecomposition.size() == 0) { + LOG4CPLUS_ERROR(logger, "Cannot solve the given equation system, as the SCC decomposition returned no SCCs."); + throw storm::exceptions::IllegalArgumentException() << "Cannot solve the given equation system, as the SCC decomposition returned no SCCs."; + } + + storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); + std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); + + // Calculate the optimal distribution of SCCs + std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); + LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); + + std::vector* currentX = nullptr; + std::vector* swap = nullptr; + size_t currentMaxLocalIterations = 0; + size_t localIterations = 0; + size_t globalIterations = 0; + bool converged = true; + + // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only + // solved after all SCCs it depends on have been solved. 
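+            // Sketch of the overall scheme, for exposition (the helpers named here are
+            // hypothetical; the actual work is done inline below): because the SCCs are
+            // visited in topological order, every transition leaving the current SCC
+            // targets a state whose value in x is already final, so it can be folded
+            // into the right-hand side before iterating:
+            //
+            //   for (auto const& scc : topologicalOrder) {
+            //       foldOutgoingTransitionsIntoB(scc, x);     // done inline below
+            //       valueIterationUntilConvergence(scc, x);   // done inline below
+            //   }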
+ int counter = 0; + + for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { + bool const useGpu = sccIndexIt->first; + storm::storage::StateBlock const& scc = sccIndexIt->second; + + // Generate a submatrix + storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); + storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); + std::vector sccSubB(sccSubmatrix.getRowCount()); + storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); + std::vector sccSubX(sccSubmatrix.getColumnCount()); + std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); + std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); + + // Prepare the pointers for swapping in the calculation + currentX = &sccSubX; + swap = &sccSubXSwap; + + storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, whereas b and multiplyResult are getRows() (nondet. choices times states) + std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); + sccSubNondeterministicChoiceIndices.at(0) = 0; + + // Pre-process all dependent states + // Remove outgoing transitions and create the ChoiceIndices + uint_fast64_t innerIndex = 0; + uint_fast64_t outerIndex = 0; + for (uint_fast64_t state : scc) { + // Choice Indices + sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); + + for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { + storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); + for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { + if (!subMatrixIndices.get(rowIt->getColumn())) { + // This is an outgoing transition of a state in the SCC to a state not included in the SCC + // Adding Pr(tau) * x_other to b compensates for dropping this transition from the submatrix + sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); + } + } + ++innerIndex; + } + ++outerIndex; + } + + // For the current SCC, we need to perform value iteration until convergence. 
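+            // A minimal sketch of the update performed below (assuming the usual Bellman
+            // operator): one iteration computes, for every state s with row group R(s),
+            //
+            //   x'[s] = (min or max over rows r in R(s)) of ( b[r] + sum_j A[r][j] * x[j] )
+            //
+            // realized as multiplyWithVector + addVectorsInPlace + reduceVectorMin/Max, and
+            // the iteration stops once x and x' are equal modulo the configured precision.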
+ if (useGpu) { +#ifdef STORM_HAVE_CUDAFORSTORM + if (!resetCudaDevice()) { + LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, cannot use the CUDA Equation Solver."); + throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, cannot use the CUDA Equation Solver."; + } + + //LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); + //LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); + //LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); + + bool result = false; + localIterations = 0; + if (minimize) { + result = basicValueIteration_mvReduce_uint64_float_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } else { + result = basicValueIteration_mvReduce_uint64_float_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); + + if (!result) { + converged = false; + LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Cannot continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Cannot continue."; + } else { + converged = true; + } + + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + +#else + LOG4CPLUS_ERROR(logger, "The useGpu flag of an SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu flag of an SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; +#endif + } else { + localIterations = 0; + converged = false; + while (!converged && localIterations < this->maximalNumberOfIterations) { + // Compute x' = A*x + b. + sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); + storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); + + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); + //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + + /* + Versus: + A.multiplyWithVector(*currentX, *multiplyResult); + storm::utility::vector::addVectorsInPlace(*multiplyResult, b); + */ + + // Reduce the vector x' by applying min/max for all non-deterministic choices. 
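+                    // Host-side reference of this reduction (a sketch for clarity, not used
+                    // by the solver): group boundaries come from the sub-choice indices, and
+                    // each state keeps the best value over its rows.
+                    //
+                    //   for (size_t state = 0; state + 1 < sccSubNondeterministicChoiceIndices.size(); ++state) {
+                    //       float best = sccMultiplyResult[sccSubNondeterministicChoiceIndices[state]];
+                    //       for (uint_fast64_t r = sccSubNondeterministicChoiceIndices[state] + 1; r < sccSubNondeterministicChoiceIndices[state + 1]; ++r) {
+                    //           best = minimize ? std::min(best, sccMultiplyResult[r]) : std::max(best, sccMultiplyResult[r]);
+                    //       }
+                    //       (*swap)[state] = best;
+                    //   }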
+ if (minimize) { + storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } else { + storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } + + // Determine whether the method converged. + // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher + // running time. In fact, it is faster. This has to be investigated. + // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); + + // Update environment variables. + std::swap(currentX, swap); + + ++localIterations; + ++globalIterations; + } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); + } + + + // The result of this SCC has to be written back into the main result vector + innerIndex = 0; + for (uint_fast64_t state : scc) { + x.at(state) = currentX->at(innerIndex); + ++innerIndex; + } + + // Since the pointers for swapping in the calculation point to temps they should not be valid anymore + currentX = nullptr; + swap = nullptr; + + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + } + + // Check if the solver converged and issue a warning otherwise. + if (converged) { + LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); + } else { + LOG4CPLUS_WARN(logger, "Iterative solver did not converge after " << currentMaxLocalIterations << " iterations."); + } + } template void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { + +#ifndef GPU_USE_DOUBLE + std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; + TopologicalValueIterationNondeterministicLinearEquationSolver tvindles(precision, maximalNumberOfIterations, relative); + + storm::storage::SparseMatrix new_A = A.toValueType(); + std::vector new_x = storm::utility::vector::toValueType(x); + std::vector const new_b = storm::utility::vector::toValueType(b); + + tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr); + + for (size_t i = 0, size = new_x.size(); i < size; ++i) { + x.at(i) = new_x.at(i); + } +#else + std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; // For testing only LOG4CPLUS_INFO(logger, ">>> Using GPU based model checker! 
<<<"); @@ -224,6 +421,7 @@ namespace storm { else { LOG4CPLUS_WARN(logger, "Iterative solver did not converge after " << currentMaxLocalIterations << " iterations."); } +#endif } template diff --git a/src/storage/SparseMatrix.cpp b/src/storage/SparseMatrix.cpp index c300e6858..ff7d71db5 100644 --- a/src/storage/SparseMatrix.cpp +++ b/src/storage/SparseMatrix.cpp @@ -1000,6 +1000,10 @@ namespace storm { template class SparseMatrix; template std::ostream& operator<<(std::ostream& out, SparseMatrix const& matrix); + template class MatrixEntry < float > ; + template class SparseMatrixBuilder < float >; + template class SparseMatrix < float >; + template std::ostream& operator<<(std::ostream& out, SparseMatrix const& matrix); } // namespace storage } // namespace storm diff --git a/src/storage/SparseMatrix.h b/src/storage/SparseMatrix.h index a06fbeb60..2a3dcc7c1 100644 --- a/src/storage/SparseMatrix.h +++ b/src/storage/SparseMatrix.h @@ -733,6 +733,23 @@ namespace storm { * Returns a reference to the internal columnMapping vector */ std::vector> const& __internal_getColumnsAndValues(); + + /*! + * Returns a copy of the matrix with the chosen internal data type + */ + template + SparseMatrix toValueType() const { + std::vector> new_columnsAndValues; + std::vector new_rowIndications(rowIndications); + std::vector new_rowGroupIndices(rowGroupIndices); + + new_columnsAndValues.resize(columnsAndValues.size()); + for (size_t i = 0, size = columnsAndValues.size(); i < size; ++i) { + new_columnsAndValues.at(i) = MatrixEntry(columnsAndValues.at(i).getColumn(), static_cast(columnsAndValues.at(i).getValue())); + } + + return SparseMatrix(columnCount, std::move(new_rowIndications), std::move(new_columnsAndValues), std::move(new_rowGroupIndices)); + } private: /*! * Creates a submatrix of the current matrix by keeping only row groups and columns in the given row group diff --git a/src/storage/StronglyConnectedComponentDecomposition.cpp b/src/storage/StronglyConnectedComponentDecomposition.cpp index 454ba9e00..7c584a312 100644 --- a/src/storage/StronglyConnectedComponentDecomposition.cpp +++ b/src/storage/StronglyConnectedComponentDecomposition.cpp @@ -223,5 +223,6 @@ namespace storm { // Explicitly instantiate the SCC decomposition. template class StronglyConnectedComponentDecomposition; + template class StronglyConnectedComponentDecomposition; } // namespace storage } // namespace storm \ No newline at end of file diff --git a/src/utility/vector.h b/src/utility/vector.h index 79fd28046..1547903fa 100644 --- a/src/utility/vector.h +++ b/src/utility/vector.h @@ -422,6 +422,20 @@ namespace storm { return subVector; } + + /*! 
+ * Converts the given vector to the given ValueType + */ + template + std::vector toValueType(std::vector const& oldVector) { + std::vector resultVector; + resultVector.resize(oldVector.size()); + for (size_t i = 0, size = oldVector.size(); i < size; ++i) { + resultVector.at(i) = static_cast(oldVector.at(i)); + } + + return resultVector; + } } // namespace vector } // namespace utility } // namespace storm diff --git a/test/functional/solver/CudaPluginTest.cpp b/test/functional/solver/CudaPluginTest.cpp index 3d2a69c84..da52f3e2b 100644 --- a/test/functional/solver/CudaPluginTest.cpp +++ b/test/functional/solver/CudaPluginTest.cpp @@ -41,6 +41,38 @@ TEST(CudaPlugin, SpMV_4x4) { ASSERT_EQ(b.at(3), 0); } +TEST(CudaPlugin, SpMV_4x4_float) { + storm::storage::SparseMatrixBuilder matrixBuilder(4, 4, 10); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 8.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 2, -5.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 3, 2.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 0, 2.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 2.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, 4.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 4.0f)); + + + storm::storage::SparseMatrix matrix; + ASSERT_NO_THROW(matrix = matrixBuilder.build()); + + ASSERT_EQ(4, matrix.getRowCount()); + ASSERT_EQ(4, matrix.getColumnCount()); + ASSERT_EQ(10, matrix.getEntryCount()); + + std::vector x({ 0.f, 4.f, 1.f, 1.f }); + std::vector b({ 0.f, 0.f, 0.f, 0.f }); + + ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); + + ASSERT_EQ(b.at(0), 3); + ASSERT_EQ(b.at(1), 25); + ASSERT_EQ(b.at(2), 16); + ASSERT_EQ(b.at(3), 0); +} + TEST(CudaPlugin, SpMV_VerySmall) { storm::storage::SparseMatrixBuilder matrixBuilder(2, 2, 2); ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); @@ -62,6 +94,27 @@ TEST(CudaPlugin, SpMV_VerySmall) { ASSERT_EQ(b.at(1), 16.0); } +TEST(CudaPlugin, SpMV_VerySmall_float) { + storm::storage::SparseMatrixBuilder matrixBuilder(2, 2, 2); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0)); + + storm::storage::SparseMatrix matrix; + ASSERT_NO_THROW(matrix = matrixBuilder.build()); + + ASSERT_EQ(2, matrix.getRowCount()); + ASSERT_EQ(2, matrix.getColumnCount()); + ASSERT_EQ(2, matrix.getEntryCount()); + + std::vector x({ 4.0, 8.0 }); + std::vector b({ 0.0, 0.0 }); + + ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); + + ASSERT_EQ(b.at(0), 4.0); + ASSERT_EQ(b.at(1), 16.0); +} + TEST(CudaPlugin, AddVectorsInplace) { std::vector vectorA_1 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; std::vector vectorA_2 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; @@ -93,6 +146,37 @@ TEST(CudaPlugin, AddVectorsInplace) { } } +TEST(CudaPlugin, AddVectorsInplace_float) { + std::vector vectorA_1 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; + std::vector vectorA_2 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; + std::vector 
+    std::vector<float> vectorA_3 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f };
+    std::vector<float> vectorB = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+    std::vector<float> vectorC = { -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f };
+
+    ASSERT_EQ(vectorA_1.size(), 8);
+    ASSERT_EQ(vectorA_2.size(), 8);
+    ASSERT_EQ(vectorA_3.size(), 8);
+    ASSERT_EQ(vectorB.size(), 8);
+    ASSERT_EQ(vectorC.size(), 8);
+
+    ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_1, vectorB));
+    ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_2, vectorC));
+
+    ASSERT_EQ(vectorA_1.size(), 8);
+    ASSERT_EQ(vectorA_2.size(), 8);
+    ASSERT_EQ(vectorA_3.size(), 8);
+    ASSERT_EQ(vectorB.size(), 8);
+    ASSERT_EQ(vectorC.size(), 8);
+
+    for (size_t i = 0; i < vectorA_3.size(); ++i) {
+        float cpu_result_b = vectorA_3.at(i) + vectorB.at(i);
+        float cpu_result_c = vectorA_3.at(i) + vectorC.at(i);
+
+        ASSERT_EQ(cpu_result_b, vectorA_1.at(i));
+        ASSERT_EQ(cpu_result_c, vectorA_2.at(i));
+    }
+}
+
 TEST(CudaPlugin, ReduceGroupedVector) {
    std::vector<double> groupedVector = {
        0.0, -1000.0, 0.000004, // Group 0
@@ -138,15 +222,60 @@ TEST(CudaPlugin, ReduceGroupedVector) {
    }
}

+TEST(CudaPlugin, ReduceGroupedVector_float) {
+    std::vector<float> groupedVector = {
+        0.0f, -1000.0f, 0.000004f, // Group 0
+        5.0f, // Group 1
+        0.0f, 1.0f, 2.0f, 3.0f, // Group 2
+        -1000.0f, -3.14f, -0.0002f, // Group 3 (neg only)
+        25.25f, 25.25f, 25.25f, // Group 4
+        0.0f, 0.0f, 1.0f, // Group 5
+        -0.000001f, 0.000001f // Group 6
+    };
+    std::vector<uint_fast64_t> grouping = {
+        0, 3, 4, 8, 11, 14, 17, 19
+    };
+
+    std::vector<float> result_minimize = {
+        -1000.0f, // Group 0
+        5.0f,
+        0.0f,
+        -1000.0f,
+        25.25f,
+        0.0f,
+        -0.000001f
+    };
+    std::vector<float> result_maximize = {
+        0.000004f,
+        5.0f,
+        3.0f,
+        -0.0002f,
+        25.25f,
+        1.0f,
+        0.000001f
+    };
+
+    std::vector<float> result_cuda_minimize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+    std::vector<float> result_cuda_maximize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+
+    ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_minimize(groupedVector, grouping, result_cuda_minimize));
+    ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_maximize(groupedVector, grouping, result_cuda_maximize));
+
+    for (size_t i = 0; i < result_minimize.size(); ++i) {
+        ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i));
+        ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i));
+    }
+}
+
 TEST(CudaPlugin, equalModuloPrecision) {
    std::vector<double> x = {
-        123.45L, 67.8L, 901.23L, 456789.012L, 3.456789L, -4567890.12L
+        123.45, 67.8, 901.23, 456789.012, 3.456789, -4567890.12
    };
    std::vector<double> y1 = {
-        0.45L, 0.8L, 0.23L, 0.012L, 0.456789L, -0.12L
+        0.45, 0.8, 0.23, 0.012, 0.456789, -0.12
    };
    std::vector<double> y2 = {
-        0.45L, 0.8L, 0.23L, 456789.012L, 0.456789L, -4567890.12L
+        0.45, 0.8, 0.23, 456789.012, 0.456789, -4567890.12
    };
    std::vector<double> x2;
    std::vector<double> x3;
@@ -163,21 +292,63 @@ TEST(CudaPlugin, equalModuloPrecision) {
        y4.push_back(1.0);
    }

-    double maxElement1 = 0.0L;
-    double maxElement2 = 0.0L;
-    double maxElement3 = 0.0L;
-    double maxElement4 = 0.0L;
+    double maxElement1 = 0.0;
+    double maxElement2 = 0.0;
+    double maxElement3 = 0.0;
+    double maxElement4 = 0.0;

    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y1, maxElement1));
    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y2, maxElement2));

    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x2, y3, maxElement3));
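For reference, the values asserted in these tests can be reproduced on the CPU. The sketch below is inferred from the asserted results rather than taken from the CUDA sources: in non-relative mode the kernels appear to return the maximum absolute difference between the two vectors, in relative mode the maximum difference relative to the second vector.

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Host-side reference for the equalModuloPrecision result (an inference
    // from the test expectations, not the plugin implementation).
    template<typename T>
    T maxDifference(std::vector<T> const& x, std::vector<T> const& y, bool relative) {
        T maxDiff = static_cast<T>(0);
        for (size_t i = 0; i < x.size(); ++i) {
            T diff = std::abs(x.at(i) - y.at(i));
            if (relative) {
                diff /= std::abs(y.at(i));
            }
            maxDiff = std::max(maxDiff, diff);
        }
        return maxDiff;
    }

On the vectors above this yields 4567890.0 and 901.0 for the two non-relative calls and 998.0 and 1001.0 for the relative ones, matching the asserted kernel results.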
    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x3, y4, maxElement4));

-    ASSERT_DOUBLE_EQ(4567890.0L, maxElement1);
-    ASSERT_DOUBLE_EQ(901.0L, maxElement2);
+    ASSERT_DOUBLE_EQ(4567890.0, maxElement1);
+    ASSERT_DOUBLE_EQ(901.0, maxElement2);
+
+    ASSERT_DOUBLE_EQ(998.0, maxElement3);
+    ASSERT_DOUBLE_EQ(1001.0, maxElement4);
+}
+
+TEST(CudaPlugin, equalModuloPrecision_float) {
+    std::vector<float> x = {
+        123.45f, 67.8f, 901.23f, 456789.012f, 3.456789f, -4567890.12f
+    };
+    std::vector<float> y1 = {
+        0.45f, 0.8f, 0.23f, 0.012f, 0.456789f, -0.12f
+    };
+    std::vector<float> y2 = {
+        0.45f, 0.8f, 0.23f, 456789.012f, 0.456789f, -4567890.12f
+    };
+    std::vector<float> x2;
+    std::vector<float> x3;
+    std::vector<float> y3;
+    std::vector<float> y4;
+    x2.reserve(1000);
+    x3.reserve(1000);
+    y3.reserve(1000);
+    y4.reserve(1000);
+    for (size_t i = 0; i < 1000; ++i) {
+        x2.push_back(static_cast<float>(i));
+        y3.push_back(1.0f);
+        x3.push_back(-(1000.0f - static_cast<float>(i)));
+        y4.push_back(1.0f);
+    }
+
+    float maxElement1 = 0.0f;
+    float maxElement2 = 0.0f;
+    float maxElement3 = 0.0f;
+    float maxElement4 = 0.0f;
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y1, maxElement1));
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y2, maxElement2));
+
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x2, y3, maxElement3));
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x3, y4, maxElement4));
+
+    ASSERT_FLOAT_EQ(4567890.0f, maxElement1);
+    ASSERT_FLOAT_EQ(901.0f, maxElement2);

-    ASSERT_DOUBLE_EQ(998.0L, maxElement3);
-    ASSERT_DOUBLE_EQ(1001.0L, maxElement4);
+    ASSERT_FLOAT_EQ(998.0f, maxElement3);
+    ASSERT_FLOAT_EQ(1001.0f, maxElement4);
}

#endif
\ No newline at end of file

From 71320239f1d5621133e0115ce402dbc7a7cc9bd7 Mon Sep 17 00:00:00 2001
From: PBerger
Date: Thu, 21 Aug 2014 14:27:00 +0200
Subject: [PATCH 38/43] Some debug output.

Former-commit-id: dd0a60f09916935a23f2119db225103c36b6f8e9
---
 ...ValueIterationNondeterministicLinearEquationSolver.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
index 8a09d9e7d..c3e543433 100644
--- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
@@ -46,7 +46,8 @@ namespace storm {
        template<>
        void TopologicalValueIterationNondeterministicLinearEquationSolver<float>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<float> const& A, std::vector<float>& x, std::vector<float> const& b, std::vector<float>* multiplyResult, std::vector<float>* newX) const {
            // For testing only
-            LOG4CPLUS_INFO(logger, ">>> Using GPU based model checker WITH FLOAT! <<<");
+            std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl;
+            LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>");

            // Now, we need to determine the SCCs of the MDP and perform a topological sort.
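The solver leans on one invariant here: if the SCCs are processed in a topological order of their dependency graph, every SCC is solved only after all SCCs it depends on already hold final values, so each block needs to be solved exactly once. As an illustration of such an ordering, here is a minimal Kahn-style topological sort over plain adjacency lists; storm's getTopologicalSort works on the SparseMatrix dependency graph instead, so this is only a sketch of the idea.

    #include <cstdint>
    #include <queue>
    #include <vector>

    // Returns an order in which every node appears before all of its successors.
    std::vector<uint_fast64_t> kahnTopologicalSort(std::vector<std::vector<uint_fast64_t>> const& successors) {
        std::vector<uint_fast64_t> inDegree(successors.size(), 0);
        for (auto const& succ : successors) {
            for (uint_fast64_t target : succ) {
                ++inDegree[target];
            }
        }
        std::queue<uint_fast64_t> ready;
        for (uint_fast64_t node = 0; node < successors.size(); ++node) {
            if (inDegree[node] == 0) {
                ready.push(node);
            }
        }
        std::vector<uint_fast64_t> order;
        while (!ready.empty()) {
            uint_fast64_t node = ready.front();
            ready.pop();
            order.push_back(node);
            for (uint_fast64_t target : successors[node]) {
                if (--inDegree[target] == 0) {
                    ready.push(target);
                }
            }
        }
        return order;
    }

In the solver this order is computed once on the SCC dependency graph, after which the main loop simply walks the resulting sequence front to back.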
std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); @@ -228,7 +229,6 @@ namespace storm { void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { #ifndef GPU_USE_DOUBLE - std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; TopologicalValueIterationNondeterministicLinearEquationSolver tvindles(precision, maximalNumberOfIterations, relative); storm::storage::SparseMatrix new_A = A.toValueType(); @@ -241,9 +241,9 @@ namespace storm { x.at(i) = new_x.at(i); } #else - std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; // For testing only - LOG4CPLUS_INFO(logger, ">>> Using GPU based model checker! <<<"); + std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; + LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>"); // Now, we need to determine the SCCs of the MDP and perform a topological sort. std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); From 493f93a94bb11b99a830223a192e10928bff44a9 Mon Sep 17 00:00:00 2001 From: PBerger Date: Sat, 30 Aug 2014 03:32:55 +0200 Subject: [PATCH 39/43] Added __restrict__ keyword to CUDA kernel. This should enhance compiler optimization. Refactored TopologicalValueIterationNondeterministicLinearEquationSolver to support "down-casting" to float. Added better timing output. Former-commit-id: 688c40decb5150d1ff6e042cfbebe3f0b2bca12e --- .../srcCuda/cuspExtensionDouble.h | 4 +- ...onNondeterministicLinearEquationSolver.cpp | 566 +++++++----------- ...tionNondeterministicLinearEquationSolver.h | 49 ++ 3 files changed, 283 insertions(+), 336 deletions(-) diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h index eee59b007..65eb47bdd 100644 --- a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h +++ b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h @@ -65,7 +65,7 @@ namespace device template __launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) __global__ void -storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double * x, double * y) +storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y) { __shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; @@ -135,7 +135,7 @@ storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const template __launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) __global__ void -storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y, const double minMaxInitializer) +storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer) { __shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals __shared__ volatile 
uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index c3e543433..65451d743 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -1,6 +1,7 @@ #include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h" #include +#include #include "src/settings/Settings.h" #include "src/utility/vector.h" @@ -42,386 +43,283 @@ namespace storm { NondeterministicLinearEquationSolver* TopologicalValueIterationNondeterministicLinearEquationSolver::clone() const { return new TopologicalValueIterationNondeterministicLinearEquationSolver(*this); } + + template + void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { + +#ifdef GPU_USE_FLOAT +#define __FORCE_FLOAT_CALCULATION true +#else +#define __FORCE_FLOAT_CALCULATION false +#endif + if (__FORCE_FLOAT_CALCULATION && (sizeof(ValueType) == sizeof(double))) { + TopologicalValueIterationNondeterministicLinearEquationSolver tvindles(precision, maximalNumberOfIterations, relative); - template<> - void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { - // For testing only - std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; - LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>"); + storm::storage::SparseMatrix new_A = A.toValueType(); + std::vector new_x = storm::utility::vector::toValueType(x); + std::vector const new_b = storm::utility::vector::toValueType(b); - // Now, we need to determine the SCCs of the MDP and perform a topological sort. 
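A remark on the dispatch introduced above: the guard `sizeof(ValueType) == sizeof(double)` is true for double but would also match any other eight-byte type. A type-trait formulation states the intent directly; the following is only a sketch of an alternative, not what the patch does.

    #include <type_traits>

    template<typename ValueType>
    bool shouldDowncastToFloat() {
    #ifdef GPU_USE_FLOAT
        // Only re-route instantiations that actually compute in double.
        return std::is_same<ValueType, double>::value;
    #else
        return false;
    #endif
    }

Since the value is a compile-time constant, the compiler can discard the dead branch just as it can with the sizeof comparison.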
- std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); - storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); - storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); - - if (sccDecomposition.size() == 0) { - LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs."); - throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs."; - } + tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr); - storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); - std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); - - // Calculate the optimal distribution of sccs - std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); - LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); - - std::vector* currentX = nullptr; - std::vector* swap = nullptr; - size_t currentMaxLocalIterations = 0; - size_t localIterations = 0; - size_t globalIterations = 0; - bool converged = true; - - // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only - // solved after all SCCs it depends on have been solved. - int counter = 0; - - for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { - bool const useGpu = sccIndexIt->first; - storm::storage::StateBlock const& scc = sccIndexIt->second; - - // Generate a sub matrix - storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); - storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); - std::vector sccSubB(sccSubmatrix.getRowCount()); - storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); - std::vector sccSubX(sccSubmatrix.getColumnCount()); - std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); - std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); - - // Prepare the pointers for swapping in the calculation - currentX = &sccSubX; - swap = &sccSubXSwap; - - storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. 
choices times states) - std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); - sccSubNondeterministicChoiceIndices.at(0) = 0; - - // Pre-process all dependent states - // Remove outgoing transitions and create the ChoiceIndices - uint_fast64_t innerIndex = 0; - uint_fast64_t outerIndex = 0; - for (uint_fast64_t state : scc) { - // Choice Indices - sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); - - for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { - storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); - for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { - if (!subMatrixIndices.get(rowIt->getColumn())) { - // This is an outgoing transition of a state in the SCC to a state not included in the SCC - // Subtracting Pr(tau) * x_other from b fixes that - sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); - } - } - ++innerIndex; - } - ++outerIndex; + for (size_t i = 0, size = new_x.size(); i < size; ++i) { + x.at(i) = new_x.at(i); } + return; + } + + // For testing only + if (sizeof(ValueType) == sizeof(double)) { + std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; + LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>"); + } else { + std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; + LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>"); + } - // For the current SCC, we need to perform value iteration until convergence. - if (useGpu) { + // Now, we need to determine the SCCs of the MDP and perform a topological sort. 
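For readers new to the representation used in the following lines: getRowGroupIndices() yields one offset per state plus a final sentinel, and the rows [groups[s], groups[s+1]) of the matrix are the nondeterministic choices of state s. A small self-contained illustration, using the grouping vector from the ReduceGroupedVector tests above:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
        // Grouping vector from the ReduceGroupedVector tests: 7 state groups over 19 rows.
        std::vector<uint_fast64_t> groups = { 0, 3, 4, 8, 11, 14, 17, 19 };
        for (std::size_t state = 0; state + 1 < groups.size(); ++state) {
            std::cout << "state " << state << " owns rows [" << groups[state] << ", " << groups[state + 1]
                      << "), i.e. " << (groups[state + 1] - groups[state]) << " choices" << std::endl;
        }
        return 0;
    }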
+ std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); + storm::models::NonDeterministicMatrixBasedPseudoModel const pseudoModel(A, nondeterministicChoiceIndices); + + // Check if the decomposition is necessary #ifdef STORM_HAVE_CUDAFORSTORM - if (!resetCudaDevice()) { - LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); - throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; - } - - //LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); - //LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); - //LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); - - bool result = false; - localIterations = 0; - if (minimize) { - result = basicValueIteration_mvReduce_uint64_float_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } else { - result = basicValueIteration_mvReduce_uint64_float_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); - - if (!result) { - converged = false; - LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); - throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; - } else { - converged = true; - } - - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; - } - +#define __USE_CUDAFORSTORM_OPT true + size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(A.getRowCount()), nondeterministicChoiceIndices.size(), static_cast(A.getEntryCount())); + size_t const cudaFreeMemory = static_cast(getFreeCudaMemory() * 0.95); #else - LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); - throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; +#define __USE_CUDAFORSTORM_OPT false + size_t const gpuSizeOfCompleteSystem = 0; + size_t const cudaFreeMemory = 0; #endif - } else { - localIterations = 0; - converged = false; - while (!converged && localIterations < this->maximalNumberOfIterations) { - // Compute x' = A*x + b. 
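The size check introduced above asks whether the complete system fits into device memory at once, with a five percent safety margin. The authoritative estimate is basicValueIteration_mvReduce_uint64_double_calculateMemorySize from the CUDA plugin; the following back-of-the-envelope version merely illustrates the kind of bookkeeping involved, and its exact buffer list is an assumption.

    #include <cstddef>
    #include <cstdint>

    // Rough device footprint of one value-iteration system (assumed buffer set).
    std::size_t estimateDeviceBytes(std::size_t rowCount, std::size_t groupCount, std::size_t entryCount) {
        std::size_t bytes = 0;
        bytes += sizeof(uint_fast64_t) * (rowCount + 1);                 // row offsets (rowIndications)
        bytes += (sizeof(uint_fast64_t) + sizeof(double)) * entryCount;  // column indices and values
        bytes += sizeof(double) * groupCount * 2;                        // x and its swap vector
        bytes += sizeof(double) * rowCount * 2;                          // b and the A*x scratch vector
        bytes += sizeof(uint_fast64_t) * (groupCount + 1);               // row group indices
        return bytes;
    }

A caller would compare the result against getFreeCudaMemory() * 0.95 and take the per-SCC path when the full system does not fit.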
- sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); - storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); - - //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); - //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + std::vector> sccDecomposition; + if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) { + // Dummy output for SCC Times + std::cout << "Computing the SCC Decomposition took 0ms" << std::endl; - /* - Versus: - A.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, b); - */ - - // Reduce the vector x' by applying min/max for all non-deterministic choices. - if (minimize) { - storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); - } else { - storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); - } - - // Determine whether the method converged. - // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher - // running time. In fact, it is faster. This has to be investigated. - // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); - - // Update environment variables. - std::swap(currentX, swap); - - ++localIterations; - ++globalIterations; - } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); +#ifdef STORM_HAVE_CUDAFORSTORM + if (!resetCudaDevice()) { + LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); + throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; } - - // The Result of this SCC has to be taken back into the main result vector - innerIndex = 0; - for (uint_fast64_t state : scc) { - x.at(state) = currentX->at(innerIndex); - ++innerIndex; + std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now(); + bool result = false; + size_t globalIterations = 0; + if (minimize) { + result = __basicValueIteration_mvReduce_uint64_minimize(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); + } else { + result = __basicValueIteration_mvReduce_uint64_maximize(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); } + LOG4CPLUS_INFO(logger, "Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); - // Since the pointers for swapping in the calculation point to temps they should not be valid anymore - currentX = nullptr; - swap = nullptr; - - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; + bool converged = false; + if (!result) { + converged = false; + LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. 
Can not continue."; + } else { + converged = true; } - } - - // Check if the solver converged and issue a warning otherwise. - if (converged) { - LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); - } else { - LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations."); - } - } - - template - void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { - -#ifndef GPU_USE_DOUBLE - TopologicalValueIterationNondeterministicLinearEquationSolver tvindles(precision, maximalNumberOfIterations, relative); - storm::storage::SparseMatrix new_A = A.toValueType(); - std::vector new_x = storm::utility::vector::toValueType(x); - std::vector const new_b = storm::utility::vector::toValueType(b); + std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now(); + std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast(calcEndTime - calcStartTime).count() << "ms." << std::endl; - tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr); + std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << globalIterations << " iterations in a single block." << std::endl; - for (size_t i = 0, size = new_x.size(); i < size; ++i) { - x.at(i) = new_x.at(i); - } + // Check if the solver converged and issue a warning otherwise. + if (converged) { + LOG4CPLUS_INFO(logger, "Iterative solver converged after " << globalIterations << " iterations."); + } else { + LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << globalIterations << " iterations."); + } #else - // For testing only - std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; - LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>"); - - // Now, we need to determine the SCCs of the MDP and perform a topological sort. - std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); - storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); - storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); + LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. 
Internal Error!"; +#endif + } else { + std::chrono::high_resolution_clock::time_point sccStartTime = std::chrono::high_resolution_clock::now(); + storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); - if (sccDecomposition.size() == 0) { - LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs."); - throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs."; - } + if (sccDecomposition.size() == 0) { + LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs."); + throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs."; + } - storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); - std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); - - // Calculate the optimal distribution of sccs - std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); - LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); - - std::vector* currentX = nullptr; - std::vector* swap = nullptr; - size_t currentMaxLocalIterations = 0; - size_t localIterations = 0; - size_t globalIterations = 0; - bool converged = true; - - // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only - // solved after all SCCs it depends on have been solved. - int counter = 0; - - for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { - bool const useGpu = sccIndexIt->first; - storm::storage::StateBlock const& scc = sccIndexIt->second; - - // Generate a sub matrix - storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); - storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); - std::vector sccSubB(sccSubmatrix.getRowCount()); - storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); - std::vector sccSubX(sccSubmatrix.getColumnCount()); - std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); - std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); - - // Prepare the pointers for swapping in the calculation - currentX = &sccSubX; - swap = &sccSubXSwap; - - storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. 
choices times states) - std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); - sccSubNondeterministicChoiceIndices.at(0) = 0; - - // Pre-process all dependent states - // Remove outgoing transitions and create the ChoiceIndices - uint_fast64_t innerIndex = 0; - uint_fast64_t outerIndex = 0; - for (uint_fast64_t state: scc) { - // Choice Indices - sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); - - for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { - typename storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); - for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { - if (!subMatrixIndices.get(rowIt->getColumn())) { - // This is an outgoing transition of a state in the SCC to a state not included in the SCC - // Subtracting Pr(tau) * x_other from b fixes that - sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); + storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); + std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); + + // Calculate the optimal distribution of sccs + std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); + LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); + + std::chrono::high_resolution_clock::time_point sccEndTime = std::chrono::high_resolution_clock::now(); + std::cout << "Computing the SCC Decomposition took " << std::chrono::duration_cast(sccEndTime - sccStartTime).count() << "ms." << std::endl; + + std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now(); + + std::vector* currentX = nullptr; + std::vector* swap = nullptr; + size_t currentMaxLocalIterations = 0; + size_t localIterations = 0; + size_t globalIterations = 0; + bool converged = true; + + // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only + // solved after all SCCs it depends on have been solved. + int counter = 0; + + for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { + bool const useGpu = sccIndexIt->first; + storm::storage::StateBlock const& scc = sccIndexIt->second; + + // Generate a sub matrix + storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); + storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); + std::vector sccSubB(sccSubmatrix.getRowCount()); + storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); + std::vector sccSubX(sccSubmatrix.getColumnCount()); + std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); + std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); + + // Prepare the pointers for swapping in the calculation + currentX = &sccSubX; + swap = &sccSubXSwap; + + storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. 
choices times states) + std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); + sccSubNondeterministicChoiceIndices.at(0) = 0; + + // Pre-process all dependent states + // Remove outgoing transitions and create the ChoiceIndices + uint_fast64_t innerIndex = 0; + uint_fast64_t outerIndex = 0; + for (uint_fast64_t state : scc) { + // Choice Indices + sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); + + for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { + typename storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); + for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { + if (!subMatrixIndices.get(rowIt->getColumn())) { + // This is an outgoing transition of a state in the SCC to a state not included in the SCC + // Subtracting Pr(tau) * x_other from b fixes that + sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); + } } + ++innerIndex; } - ++innerIndex; + ++outerIndex; } - ++outerIndex; - } - // For the current SCC, we need to perform value iteration until convergence. - if (useGpu) { + // For the current SCC, we need to perform value iteration until convergence. + if (useGpu) { #ifdef STORM_HAVE_CUDAFORSTORM - if (!resetCudaDevice()) { - LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); - throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; - } + if (!resetCudaDevice()) { + LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); + throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; + } - //LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); - //LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); - //LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); + //LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); + //LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); + //LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); - bool result = false; - localIterations = 0; - if (minimize) { - result = basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, 
sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } else { - result = basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); - } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); - - if (!result) { - converged = false; - LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); - throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; - } else { - converged = true; - } + bool result = false; + localIterations = 0; + if (minimize) { + result = __basicValueIteration_mvReduce_uint64_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } else { + result = __basicValueIteration_mvReduce_uint64_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; - } + if (!result) { + converged = false; + LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; + } else { + converged = true; + } + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + globalIterations += localIterations; #else - LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); - throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; + LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; #endif - } else { - localIterations = 0; - converged = false; - while (!converged && localIterations < this->maximalNumberOfIterations) { - // Compute x' = A*x + b. - sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); - storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); + } else { + std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl; + localIterations = 0; + converged = false; + while (!converged && localIterations < this->maximalNumberOfIterations) { + // Compute x' = A*x + b. 
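The CPU fallback continuing below is plain value iteration. Condensed into one self-contained routine over a dense matrix (illustrative only; the solver uses the sparse multiplyWithVector and reduceVectorMin/Max helpers), a single sweep reads:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // One sweep of x' = reduce(A * x + b), where the rows of each group are
    // reduced with min (minimize) or max (maximize).
    std::vector<double> valueIterationSweep(std::vector<std::vector<double>> const& A,
                                            std::vector<double> const& x,
                                            std::vector<double> const& b,
                                            std::vector<uint_fast64_t> const& groups,
                                            bool minimize) {
        std::vector<double> axb(A.size());
        for (std::size_t row = 0; row < A.size(); ++row) {
            double sum = 0.0;
            for (std::size_t col = 0; col < x.size(); ++col) {
                sum += A[row][col] * x[col];
            }
            axb[row] = sum + b[row];
        }
        std::vector<double> next(groups.size() - 1);
        for (std::size_t group = 0; group + 1 < groups.size(); ++group) {
            auto first = axb.begin() + groups[group];
            auto last = axb.begin() + groups[group + 1];
            next[group] = minimize ? *std::min_element(first, last) : *std::max_element(first, last);
        }
        return next;
    }

Sweeps repeat until the old and new x agree modulo the configured precision or the iteration bound is reached, which is exactly the structure of the loop that follows.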
+ sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); + storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); + + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); + //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + + /* + Versus: + A.multiplyWithVector(*currentX, *multiplyResult); + storm::utility::vector::addVectorsInPlace(*multiplyResult, b); + */ + + // Reduce the vector x' by applying min/max for all non-deterministic choices. + if (minimize) { + storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } else { + storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } - //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); - //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + // Determine whether the method converged. + // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher + // running time. In fact, it is faster. This has to be investigated. + // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); - /* - Versus: - A.multiplyWithVector(*currentX, *multiplyResult); - storm::utility::vector::addVectorsInPlace(*multiplyResult, b); - */ + // Update environment variables. + std::swap(currentX, swap); - // Reduce the vector x' by applying min/max for all non-deterministic choices. - if (minimize) { - storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + ++localIterations; + ++globalIterations; } - else { - storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); - } - - // Determine whether the method converged. - // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher - // running time. In fact, it is faster. This has to be investigated. - // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); + } - // Update environment variables. - std::swap(currentX, swap); - ++localIterations; - ++globalIterations; + // The Result of this SCC has to be taken back into the main result vector + innerIndex = 0; + for (uint_fast64_t state : scc) { + x.at(state) = currentX->at(innerIndex); + ++innerIndex; } - LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); - } + // Since the pointers for swapping in the calculation point to temps they should not be valid anymore + currentX = nullptr; + swap = nullptr; - // The Result of this SCC has to be taken back into the main result vector - innerIndex = 0; - for (uint_fast64_t state: scc) { - x.at(state) = currentX->at(innerIndex); - ++innerIndex; + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. 
+                    if (localIterations > currentMaxLocalIterations) {
+                        currentMaxLocalIterations = localIterations;
+                    }
                }

-                // Since the pointers for swapping in the calculation point to temps they should not be valid anymore
-                currentX = nullptr;
-                swap = nullptr;
+                std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << currentMaxLocalIterations << " iterations in a single block." << std::endl;

-                // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
-                // track of the maximum.
-                if (localIterations > currentMaxLocalIterations) {
-                    currentMaxLocalIterations = localIterations;
+                // Check if the solver converged and issue a warning otherwise.
+                if (converged) {
+                    LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
+                } else {
+                    LOG4CPLUS_WARN(logger, "Iterative solver did not converge after " << currentMaxLocalIterations << " iterations.");
                }
-            }
-            // Check if the solver converged and issue a warning otherwise.
-            if (converged) {
-                LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
-            }
-            else {
-                LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
+
+                std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now();
+                std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast<std::chrono::milliseconds>(calcEndTime - calcStartTime).count() << "ms." << std::endl;
            }
-#endif
        }

        template
diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
index 40d9df354..bdbf788cd 100644
--- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
@@ -8,6 +8,11 @@
 #include
 #include

+#include "storm-config.h"
+#ifdef STORM_HAVE_CUDAFORSTORM
+#	include "cudaForStorm.h"
+#endif
+
 namespace storm {
    namespace solver {

@@ -42,6 +47,50 @@
             */
            std::vector<std::pair<bool, storm::storage::StateBlock>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const;
        };
+
+        template <typename ValueType>
+        bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+            //
+            throw;
+        }
+        template <>
+        inline bool __basicValueIteration_mvReduce_uint64_minimize<double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+#ifdef STORM_HAVE_CUDAFORSTORM
+            return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+#else
+            throw;
+#endif
+        }
+        template <>
+        inline bool __basicValueIteration_mvReduce_uint64_minimize<float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+#ifdef STORM_HAVE_CUDAFORSTORM
+            return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+#else
+            throw;
+#endif
+        }
+
+        template <typename ValueType>
+        bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+            //
+            throw;
+        }
+        template <>
+        inline bool __basicValueIteration_mvReduce_uint64_maximize<double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+#ifdef STORM_HAVE_CUDAFORSTORM
+            return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+#else
+            throw;
+#endif
+        }
+        template <>
+        inline bool __basicValueIteration_mvReduce_uint64_maximize<float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+#ifdef STORM_HAVE_CUDAFORSTORM
+            return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+#else
+            throw;
+#endif
+        }
    } // namespace solver
} // namespace storm

From c8e05f71374b48d0b51a7e5a11c70814fc256c03 Mon Sep 17 00:00:00 2001
From: PBerger
Date: Sat, 30 Aug 2014 13:09:14 +0200
Subject: [PATCH 40/43] Added explicit template instance.

Former-commit-id: 1ffcf0e47f37837cd5e89e82e1fbc1d6b5708f32
---
 ...logicalValueIterationNondeterministicLinearEquationSolver.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
index 65451d743..c8fbf9a4e 100644
--- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
@@ -462,5 +462,6 @@ namespace storm {
        // Explicitly instantiate the solver.
        template class TopologicalValueIterationNondeterministicLinearEquationSolver<double>;
+        template class TopologicalValueIterationNondeterministicLinearEquationSolver<float>;
    } // namespace solver
} // namespace storm

From 72af8c72463b5052062f9511e5312e7d4e1bfcc2 Mon Sep 17 00:00:00 2001
From: PBerger
Date: Thu, 4 Sep 2014 23:48:10 +0200
Subject: [PATCH 41/43] Added missing (but implicitly declared) template instance.
Former-commit-id: 23a897d83edf57600c22368b9bef7068fa55667c
---
 src/storage/StronglyConnectedComponentDecomposition.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/storage/StronglyConnectedComponentDecomposition.cpp b/src/storage/StronglyConnectedComponentDecomposition.cpp
index f243f59bf..a09f04717 100644
--- a/src/storage/StronglyConnectedComponentDecomposition.cpp
+++ b/src/storage/StronglyConnectedComponentDecomposition.cpp
@@ -224,5 +224,6 @@ namespace storm {
        // Explicitly instantiate the SCC decomposition.
        template class StronglyConnectedComponentDecomposition<double>;
+        template class StronglyConnectedComponentDecomposition<float>;
    } // namespace storage
} // namespace storm
\ No newline at end of file

From ed3df5f1558d98a0bd64044cb9027235f5d7b315 Mon Sep 17 00:00:00 2001
From: PBerger
Date: Thu, 11 Sep 2014 19:45:07 +0200
Subject: [PATCH 42/43] Last push :)

Former-commit-id: 72c4b69cb24549ae6899879438497a1c078c0760
---
 resources/cudaForStorm/srcCuda/cuspExtensionDouble.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
index 65eb47bdd..901df0ae7 100644
--- a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
+++ b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
@@ -2,7 +2,7 @@
 * This is an extension of the original CUSP csr_vector.h SPMV implementation.
 * It is based on the Code and incorporates changes as to cope with the details
 * of the StoRM code.
- * As this is mostly copy & paste, the original license still applies.
+ * Changes have been made for 1) a different input format, 2) the sum calculation, and 3) the group-reduce algorithm.
 */

/*

From 5299ed5172004fc582d28464b2d52a85971f4c5b Mon Sep 17 00:00:00 2001
From: David_Korzeniewski
Date: Thu, 4 Dec 2014 19:21:23 +0100
Subject: [PATCH 43/43] Adapted FindCusp to fail silently if cusp is not found.

Now configuring fails with a meaningful error message instead of syntax errors.
Former-commit-id: e77388a186b4c071d2c1aeedd707b2ee25fb9294 --- resources/cmake/FindCusp.cmake | 51 +++++++++++++++++----------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/resources/cmake/FindCusp.cmake b/resources/cmake/FindCusp.cmake index 9520d1426..bda100911 100644 --- a/resources/cmake/FindCusp.cmake +++ b/resources/cmake/FindCusp.cmake @@ -23,33 +23,34 @@ find_path(CUSP_INCLUDE_DIR ) if(CUSP_INCLUDE_DIR) list(REMOVE_DUPLICATES CUSP_INCLUDE_DIR) -endif(CUSP_INCLUDE_DIR) + + # Find cusp version + file(STRINGS ${CUSP_INCLUDE_DIR}/cusp/version.h + version + REGEX "#define CUSP_VERSION[ \t]+([0-9x]+)" + ) + string(REGEX REPLACE + "#define CUSP_VERSION[ \t]+" + "" + version + "${version}" + ) -# Find cusp version -file(STRINGS ${CUSP_INCLUDE_DIR}/cusp/version.h - version - REGEX "#define CUSP_VERSION[ \t]+([0-9x]+)" -) -string(REGEX REPLACE - "#define CUSP_VERSION[ \t]+" - "" - version - "${version}" -) - -#define CUSP_MAJOR_VERSION (CUSP_VERSION / 100000) -#define CUSP_MINOR_VERSION (CUSP_VERSION / 100 % 1000) -#define CUSP_SUBMINOR_VERSION (CUSP_VERSION % 100) + #define CUSP_MAJOR_VERSION (CUSP_VERSION / 100000) + #define CUSP_MINOR_VERSION (CUSP_VERSION / 100 % 1000) + #define CUSP_SUBMINOR_VERSION (CUSP_VERSION % 100) -math(EXPR CUSP_MAJOR_VERSION "${version} / 100000") -math(EXPR CUSP_MINOR_VERSION "${version} / 100 % 1000") -math(EXPR CUSP_PATCH_VERSION "${version} % 100") + math(EXPR CUSP_MAJOR_VERSION "${version} / 100000") + math(EXPR CUSP_MINOR_VERSION "${version} / 100 % 1000") + math(EXPR CUSP_PATCH_VERSION "${version} % 100") -set(CUSP_VERSION "${CUSP_MAJOR_VERSION}.${CUSP_MINOR_VERSION}.${CUSP_PATCH_VERSION}") + set(CUSP_VERSION "${CUSP_MAJOR_VERSION}.${CUSP_MINOR_VERSION}.${CUSP_PATCH_VERSION}") -# Check for required components -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(Cusp REQUIRED_VARS CUSP_INCLUDE_DIR VERSION_VAR CUSP_VERSION) + # Check for required components + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(Cusp REQUIRED_VARS CUSP_INCLUDE_DIR VERSION_VAR CUSP_VERSION) -set(CUSP_INCLUDE_DIRS ${CUSP_INCLUDE_DIR}) -mark_as_advanced(CUSP_INCLUDE_DIR) \ No newline at end of file + set(CUSP_INCLUDE_DIRS ${CUSP_INCLUDE_DIR}) + mark_as_advanced(CUSP_INCLUDE_DIR) + +endif(CUSP_INCLUDE_DIR) \ No newline at end of file
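Taken as a whole, the series wires up a float pipeline: convert the double system down, solve in float, and widen the result back. A self-contained round trip of that conversion (the helper is re-declared locally here so the snippet compiles on its own; in the patches it lives in storm::utility::vector):

    #include <cstdio>
    #include <vector>

    // Local stand-in for storm::utility::vector::toValueType.
    template <typename NewValueType, typename ValueType>
    std::vector<NewValueType> toValueType(std::vector<ValueType> const& oldVector) {
        return std::vector<NewValueType>(oldVector.begin(), oldVector.end());
    }

    int main() {
        std::vector<double> x = { 1.0 / 3.0, 1.0 / 7.0 };
        std::vector<float> xAsFloat = toValueType<float>(x);        // down-cast before solving
        std::vector<double> xBack = toValueType<double>(xAsFloat);  // widen the result again
        for (std::size_t i = 0; i < x.size(); ++i) {
            std::printf("%zu: double = %.17g, after float round-trip = %.17g\n", i, x[i], xBack[i]);
        }
        return 0;
    }

The printed differences show the precision given up when GPU_USE_FLOAT re-routes a double solve through the float kernels.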