
Fixed a bug in the equalModuloPrecision function of the CUDA kernel (a relative difference with y == 0 now yields |x| instead of the signed x)

Added more debug output to the CUDA handler functions
Added a function that groups SCCs to fit the available GPU memory, for better performance
Added functionality and accessors (getRowGroupEntryCount) to SparseMatrix


Former-commit-id: 770aec1b09
Branch: tempestpy_adaptions
Author: PBerger, 11 years ago
Commit: 05814f5d73
Changed files:
1. resources/cudaForStorm/srcCuda/basicValueIteration.cu (62 changes)
2. resources/cudaForStorm/srcCuda/basicValueIteration.h (1 change)
3. src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp (259 changes)
4. src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h (12 changes)
5. src/storage/SparseMatrix.cpp (9 changes)
6. src/storage/SparseMatrix.h (7 changes)

resources/cudaForStorm/srcCuda/basicValueIteration.cu (62 changes)

@@ -35,7 +35,7 @@ __host__ __device__ T operator()(const T &x, const T &y) const
{
if (Relative) {
if (y == 0) {
- return x;
+ return ((x >= 0) ? (x) : (-x));
}
const T result = (x - y) / y;
return ((result >= 0) ? (result) : (-result));
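The fix matters because this per-element difference feeds the convergence check against the precision threshold: with y == 0 the old code returned the signed x, so a negative difference could slip under the threshold and signal convergence too early. A minimal host-side sketch of the fixed semantics (the names here are illustrative, not the kernel's actual API):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Per-element difference "modulo precision", mirroring the fixed device functor.
    template<bool Relative, typename T>
    T differenceModuloPrecision(T const& x, T const& y) {
        if (Relative) {
            if (y == 0) {
                return std::abs(x); // fixed: "return x;" kept the sign
            }
            return std::abs((x - y) / y);
        }
        return std::abs(x - y);
    }

    // Host-side stand-in for the reduction over all components done on the device.
    bool converged(std::vector<double> const& newX, std::vector<double> const& oldX, double precision) {
        double maxDiff = 0.0;
        for (std::size_t i = 0; i < newX.size(); ++i) {
            maxDiff = std::max(maxDiff, differenceModuloPrecision<true>(newX[i], oldX[i]));
        }
        return maxDiff <= precision;
    }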
@@ -73,6 +73,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
#ifdef DEBUG
std::cout.sync_with_stdio(true);
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
@@ -143,6 +144,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
goto cleanup;
}
#ifdef DEBUG
std::cout << "(DLL) Finished allocating memory." << std::endl;
#endif
// Memory allocated, copy data to device
cudaError_t cudaCopyResult;
@@ -204,6 +209,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
goto cleanup;
}
#ifdef DEBUG
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
#endif
// Data is on device, start Kernel
while (!converged && iterationCount < maxIterationCount)
{ // Inner scope, since a goto to the cleanup label must not jump over variable initializations
@@ -239,6 +248,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
std::swap(device_x, device_xSwap);
}
#ifdef DEBUG
std::cout << "(DLL) Finished kernel execution." << std::endl;
std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
#endif
@@ -249,6 +259,10 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
goto cleanup;
}
#ifdef DEBUG
std::cout << "(DLL) Finished copying result data." << std::endl;
#endif
// All code related to freeing memory and cleaning up the device
cleanup:
if (device_matrixRowIndices != nullptr) {
@@ -307,6 +321,9 @@ cleanup:
}
device_nondeterministicChoiceIndices = nullptr;
}
#ifdef DEBUG
std::cout << "(DLL) Finished cleanup." << std::endl;
#endif
}
template <typename IndexType, typename ValueType>
@@ -323,6 +340,7 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
#ifdef DEBUG
std::cout.sync_with_stdio(true);
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl;
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size();
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
@@ -368,6 +386,10 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
goto cleanup;
}
#ifdef DEBUG
std::cout << "(DLL) Finished allocating memory." << std::endl;
#endif
// Memory allocated, copy data to device
cudaError_t cudaCopyResult;
@@ -407,9 +429,17 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
goto cleanup;
}
#ifdef DEBUG
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
#endif
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS();
#ifdef DEBUG
std::cout << "(DLL) Finished kernel execution." << std::endl;
#endif
// Get result back from the device
cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost);
if (cudaCopyResult != cudaSuccess) {
@@ -417,6 +447,10 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
goto cleanup;
}
#ifdef DEBUG
std::cout << "(DLL) Finished copying result data." << std::endl;
#endif
// All code related to freeing memory and cleaning up the device
cleanup:
if (device_matrixRowIndices != nullptr) {
@@ -454,6 +488,9 @@ cleanup:
}
device_multiplyResult = nullptr;
}
#ifdef DEBUG
std::cout << "(DLL) Finished cleanup." << std::endl;
#endif
}
template <typename ValueType>
@@ -730,4 +767,27 @@ void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const max
} else {
basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
}
}
size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
size_t const valueTypeSize = sizeof(double);
size_t const indexTypeSize = sizeof(uint_fast64_t);
/*
IndexType* device_matrixRowIndices = nullptr;
IndexType* device_matrixColIndices = nullptr;
ValueType* device_matrixValues = nullptr;
ValueType* device_x = nullptr;
ValueType* device_xSwap = nullptr;
ValueType* device_b = nullptr;
ValueType* device_multiplyResult = nullptr;
IndexType* device_nondeterministicChoiceIndices = nullptr;
*/
// Row Indices, Column Indices, Values, Choice Indices
size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
// Vectors x, xSwap, b, multiplyResult
size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
return (matrixDataSize + vectorSizes);
}
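The estimate mirrors, allocation by allocation, the eight device buffers listed in the comment above. A hypothetical caller-side sketch of how this export can be used to decide whether a problem fits on the GPU (the 0.95 padding factor is the one used by the solver below; the declarations are abbreviated stand-ins for the library's headers):

    #include <cstddef>

    // Exports from cudaForStorm (see basicValueIteration.h and the CUDA utility functions).
    size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
    size_t getFreeCudaMemory();

    bool fitsOnGpu(size_t rowCount, size_t rowGroupCount, size_t nnzCount) {
        size_t const required = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(rowCount, rowGroupCount, nnzCount);
        // Keep 5% of the free memory as padding, as the solver does.
        size_t const budget = static_cast<size_t>(getFreeCudaMemory() * 0.95);
        return required <= budget;
    }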

resources/cudaForStorm/srcCuda/basicValueIteration.h (1 change)

@@ -8,6 +8,7 @@
// Library exports
#include "cudaForStorm_Export.h"
cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b);

src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp (259 changes)

@@ -42,12 +42,12 @@ namespace storm {
}
template<typename ValueType>
- void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
+ void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
// Now, we need to determine the SCCs of the MDP and a topological sort.
//std::vector<std::vector<uint_fast64_t>> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph);
//storm::storage::SparseMatrix<T> stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents);
std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> pseudoModel(A, nondeterministicChoiceIndices);
//storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(*static_cast<storm::models::AbstractPseudoModel<ValueType>*>(&pseudoModel), false, false);
storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(pseudoModel, false, false);
@@ -60,6 +60,9 @@ namespace storm {
storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
// Calculate the optimal distribution of SCCs
std::vector<std::pair<bool, std::vector<uint_fast64_t>>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A);
// Set up the environment for the power method.
// bool multiplyResultMemoryProvided = true;
// if (multiplyResult == nullptr) {
@@ -82,12 +85,13 @@ namespace storm {
// solved after all SCCs it depends on have been solved.
int counter = 0;
- for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) {
- storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
+ for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
+ bool const useGpu = sccIndexIt->first;
+ std::vector<uint_fast64_t> const& scc = sccIndexIt->second;
// Generate a submatrix
storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
- storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(subMatrixIndices, nondeterministicChoiceIndices);
+ storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount());
storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount());
@@ -125,108 +129,115 @@ namespace storm {
}
// For the current SCC, we need to perform value iteration until convergence.
if (useGpu) {
#ifdef STORM_HAVE_CUDAFORSTORM
if (!resetCudaDevice()) {
LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
}
LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
std::vector<ValueType> copyX(*currentX);
if (minimize) {
basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
}
else {
basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
}
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
if (!resetCudaDevice()) {
LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
}
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
// Reduce the vector x' by applying min/max for all non-deterministic choices.
std::vector<ValueType> copyX(*currentX);
if (minimize) {
storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
}
else {
storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, copyX, sccSubB, sccSubNondeterministicChoiceIndices);
}
converged = true;
// DEBUG
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
// Reduce the vector x' by applying min/max for all non-deterministic choices.
if (minimize) {
storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
}
else {
storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
}
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
// Update environment variables.
std::swap(currentX, swap);
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
++localIterations;
++globalIterations;
}
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
// Update environment variables.
std::swap(currentX, swap);
uint_fast64_t diffCount = 0;
for (size_t i = 0; i < currentX->size(); ++i) {
if (currentX->at(i) != copyX.at(i)) {
LOG4CPLUS_WARN(logger, "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i));
std::cout << "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i) << std::endl;
}
}
#else
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
// Reduce the vector x' by applying min/max for all non-deterministic choices.
if (minimize) {
storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
++localIterations;
++globalIterations;
}
else {
storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
uint_fast64_t diffCount = 0;
for (size_t i = 0; i < currentX->size(); ++i) {
if (currentX->at(i) != copyX.at(i)) {
LOG4CPLUS_WARN(logger, "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i));
std::cout << "CUDA solution differs on index " << i << " diff. " << std::abs(currentX->at(i) - copyX.at(i)) << ", CPU: " << currentX->at(i) << ", CUDA: " << copyX.at(i) << std::endl;
++diffCount;
}
}
std::cout << "CUDA solution differed in " << diffCount << " of " << currentX->size() << " values." << std::endl;
#endif
} else {
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
// Reduce the vector x' by applying min/max for all non-deterministic choices.
if (minimize) {
storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
}
else {
storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
}
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
// Update environment variables.
std::swap(currentX, swap);
// Update environment variables.
std::swap(currentX, swap);
++localIterations;
++globalIterations;
++localIterations;
++globalIterations;
}
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
}
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
#endif
// The Result of this SCC has to be taken back into the main result vector
innerIndex = 0;
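For each group the restructured loop now either ships the SCC submatrix to the CUDA kernel (and, in this commit, still runs the CPU iteration afterwards to cross-check the results element-wise) or falls back to plain power iteration on the CPU. A self-contained sketch of that CPU iteration for a single SCC, using a dense stand-in for the sparse submatrix (the real code uses storm::storage::SparseMatrix and the storm::utility::vector helpers):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Power iteration for one SCC: x' = A*x + b, then a min/max reduction per
    // row group, until the componentwise difference drops below the precision.
    bool valueIterateScc(std::vector<std::vector<double>> const& A,
                         std::vector<double>& x, std::vector<double> const& b,
                         std::vector<uint_fast64_t> const& rowGroupIndices,
                         bool minimize, double precision, bool relative,
                         uint_fast64_t maxIterations) {
        std::vector<double> multiplyResult(A.size());
        std::vector<double> newX(x.size());
        bool converged = false;
        for (uint_fast64_t iteration = 0; iteration < maxIterations && !converged; ++iteration) {
            // Compute x' = A*x + b.
            for (std::size_t row = 0; row < A.size(); ++row) {
                double sum = b[row];
                for (std::size_t col = 0; col < x.size(); ++col) {
                    sum += A[row][col] * x[col];
                }
                multiplyResult[row] = sum;
            }
            // Reduce x' by applying min/max over each row group (one value per group).
            for (std::size_t group = 0; group + 1 < rowGroupIndices.size(); ++group) {
                double best = multiplyResult[rowGroupIndices[group]];
                for (uint_fast64_t row = rowGroupIndices[group] + 1; row < rowGroupIndices[group + 1]; ++row) {
                    best = minimize ? std::min(best, multiplyResult[row]) : std::max(best, multiplyResult[row]);
                }
                newX[group] = best;
            }
            // Convergence modulo precision, using the same y == 0 convention as the fixed kernel functor.
            converged = true;
            for (std::size_t i = 0; i < x.size(); ++i) {
                double const diff = relative ? (x[i] == 0.0 ? std::abs(newX[i]) : std::abs((newX[i] - x[i]) / x[i]))
                                             : std::abs(newX[i] - x[i]);
                if (diff > precision) {
                    converged = false;
                    break;
                }
            }
            x.swap(newX);
        }
        return converged;
    }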
@@ -263,6 +274,72 @@ namespace storm {
}
}
template<typename ValueType>
std::vector<std::pair<bool, std::vector<uint_fast64_t>>>
TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const {
std::vector<std::pair<bool, std::vector<uint_fast64_t>>> result;
#ifdef STORM_HAVE_CUDAFORSTORM
// 95% to have a bit of padding
size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
size_t lastResultIndex = 0;
std::vector<uint_fast64_t> const& rowGroupIndices = matrix.getRowGroupIndices();
size_t currentSize = 0;
for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
uint_fast64_t rowCount = 0;
uint_fast64_t entryCount = 0;
std::vector<uint_fast64_t> rowGroups;
rowGroups.reserve(scc.size());
for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
rowCount += matrix.getRowGroupSize(*sccIt);
entryCount += matrix.getRowGroupEntryCount(*sccIt);
rowGroups.push_back(*sccIt);
}
size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(rowCount), scc.size(), static_cast<size_t>(entryCount));
if ((currentSize + sccSize) <= cudaFreeMemory) {
// There is enough space left in the current group
if (currentSize == 0) {
result.push_back(std::make_pair(true, rowGroups));
lastResultIndex = result.size() - 1;
}
else {
result[lastResultIndex].second.insert(result[lastResultIndex].second.end(), rowGroups.begin(), rowGroups.end());
}
currentSize += sccSize;
}
else {
if (sccSize <= cudaFreeMemory) {
// Start a new GPU group with this SCC
result.push_back(std::make_pair(true, rowGroups));
lastResultIndex = result.size() - 1;
currentSize = sccSize;
}
else {
// This group is too big to fit into the CUDA memory by itself; solve it on the CPU instead
result.push_back(std::make_pair(false, rowGroups));
currentSize = 0;
}
}
}
#else
for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
std::vector<uint_fast64_t> rowGroups;
rowGroups.reserve(scc.size());
for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
rowGroups.push_back(*sccIt);
}
// Without CUDA support, every SCC is solved on the CPU as its own group.
result.push_back(std::make_pair(false, rowGroups));
}
#endif
return result;
}
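The grouping is a single greedy pass over the topologically sorted SCCs: append to the current GPU group while the estimated footprint stays within the budget, open a new GPU group when it would not, and hand any SCC that cannot fit on its own to the CPU. A condensed, self-contained sketch of the same idea (sccSizes stands in for the per-SCC estimate obtained via calculateMemorySize):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // One work package: (solve on the GPU?, SCC indices solved together).
    using SccGroup = std::pair<bool, std::vector<std::size_t>>;

    std::vector<SccGroup> groupSccs(std::vector<std::size_t> const& topologicalSort,
                                    std::vector<std::size_t> const& sccSizes,
                                    std::size_t budget) {
        std::vector<SccGroup> result;
        std::size_t currentSize = 0;
        for (std::size_t scc : topologicalSort) {
            std::size_t const sccSize = sccSizes[scc];
            if (sccSize > budget) {
                // Too big for the GPU even on its own: solve on the CPU and
                // start a fresh GPU group afterwards.
                result.emplace_back(false, std::vector<std::size_t>{scc});
                currentSize = 0;
            } else if (currentSize == 0 || currentSize + sccSize > budget) {
                // Open a new GPU group with this SCC.
                result.emplace_back(true, std::vector<std::size_t>{scc});
                currentSize = sccSize;
            } else {
                // There is enough space left in the current GPU group.
                result.back().second.push_back(scc);
                currentSize += sccSize;
            }
        }
        return result;
    }

Because the pass preserves the topological order, each group still depends only on groups that are solved before it.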
// Explicitly instantiate the solver.
template class TopologicalValueIterationNondeterministicLinearEquationSolver<double>;
} // namespace solver

src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h (12 changes)

@@ -2,6 +2,11 @@
#define STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_
#include "src/solver/NativeNondeterministicLinearEquationSolver.h"
#include "src/storage/StronglyConnectedComponentDecomposition.h"
#include "src/storage/SparseMatrix.h"
#include <utility>
#include <vector>
namespace storm {
namespace solver {
@@ -30,7 +35,12 @@ namespace storm {
virtual NondeterministicLinearEquationSolver<ValueType>* clone() const override;
- virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, std::vector<ValueType>* multiplyResult = nullptr, std::vector<ValueType>* newX = nullptr) const override;
+ virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult = nullptr, std::vector<ValueType>* newX = nullptr) const override;
private:
/*!
* Given a topological sort of an SCC decomposition, this computes a grouping of the SCCs such that each group fits into the available GPU memory.
*/
std::vector<std::pair<bool, std::vector<uint_fast64_t>>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const;
};
} // namespace solver
} // namespace storm

src/storage/SparseMatrix.cpp (9 changes)

@@ -330,6 +330,15 @@ namespace storm {
return entryCount;
}
template<typename T>
uint_fast64_t SparseMatrix<T>::getRowGroupEntryCount(uint_fast64_t const group) const {
uint_fast64_t result = 0;
for (uint_fast64_t row = this->getRowGroupIndices()[group]; row < this->getRowGroupIndices()[group + 1]; ++row) {
result += (this->rowIndications[row + 1] - this->rowIndications[row]);
}
return result;
}
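In the CSR layout, rowIndications[row + 1] - rowIndications[row] is the number of stored entries in row, so a group's entry count is a sum of these differences over the group's rows. A toy illustration with made-up data:

    #include <cstdint>
    #include <vector>

    // Three rows with 2, 3 and 1 entries; rows 0-1 form group 0, row 2 forms group 1.
    std::vector<uint_fast64_t> const rowIndications = {0, 2, 5, 6};
    std::vector<uint_fast64_t> const rowGroupIndices = {0, 2, 3};
    // getRowGroupEntryCount(0) == (2 - 0) + (5 - 2) == 5 entries
    // getRowGroupEntryCount(1) == (6 - 5) == 1 entry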
template<typename T>
uint_fast64_t SparseMatrix<T>::getRowGroupCount() const {
return rowGroupIndices.size() - 1;

src/storage/SparseMatrix.h (7 changes)

@@ -344,6 +344,13 @@ namespace storm {
* @return The number of entries in the matrix.
*/
uint_fast64_t getEntryCount() const;
/*!
* Returns the number of entries in the given row group of the matrix.
*
* @param group The index of the row group.
* @return The number of entries in the given row group of the matrix.
*/
uint_fast64_t getRowGroupEntryCount(uint_fast64_t const group) const;
/*!
* Returns the number of row groups in the matrix.
