diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
index eee59b007..65eb47bdd 100644
--- a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
+++ b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
@@ -65,7 +65,7 @@ namespace device
 template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
 __launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
 __global__ void
-storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double * x, double * y)
+storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y)
 {
     __shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
     __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2];
@@ -135,7 +135,7 @@ storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const
 template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
 __launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1)
 __global__ void
-storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y, const double minMaxInitializer)
+storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer)
 {
     __shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2];  // padded to avoid reduction conditionals
     __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2];
diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
index c3e543433..65451d743 100644
--- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
@@ -1,6 +1,7 @@
 #include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h"
 
 #include <utility>
+#include <chrono>
 
 #include "src/settings/Settings.h"
 #include "src/utility/vector.h"
@@ -42,386 +43,283 @@ namespace storm {
 		NondeterministicLinearEquationSolver<ValueType>* TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::clone() const {
 			return new TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>(*this);
         }
+        
+        template<typename ValueType>
+		void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
+			
+#ifdef GPU_USE_FLOAT
+#define __FORCE_FLOAT_CALCULATION true
+#else
+#define __FORCE_FLOAT_CALCULATION false
+#endif
+			if (__FORCE_FLOAT_CALCULATION && (sizeof(ValueType) == sizeof(double))) {
+				TopologicalValueIterationNondeterministicLinearEquationSolver<float> tvindles(precision, maximalNumberOfIterations, relative);
 
-		template<>
-		void TopologicalValueIterationNondeterministicLinearEquationSolver<float>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<float> const& A, std::vector<float>& x, std::vector<float> const& b, std::vector<float>* multiplyResult, std::vector<float>* newX) const {
-			// For testing only
-			std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl;
-			LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>");
+				storm::storage::SparseMatrix<float> new_A = A.toValueType<float>();
+				std::vector<float> new_x = storm::utility::vector::toValueType<float>(x);
+				std::vector<float> const new_b = storm::utility::vector::toValueType<float>(b);
 
-			// Now, we need to determine the SCCs of the MDP and perform a topological sort.
-			std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
-			storm::models::NonDeterministicMatrixBasedPseudoModel<float> pseudoModel(A, nondeterministicChoiceIndices);
-			storm::storage::StronglyConnectedComponentDecomposition<float> sccDecomposition(pseudoModel, false, false);
-
-			if (sccDecomposition.size() == 0) {
-				LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs.");
-				throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs.";
-			}
+				tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr);
 
-			storm::storage::SparseMatrix<float> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
-			std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
-
-			// Calculate the optimal distribution of sccs
-			std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A);
-			LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs.");
-
-			std::vector<float>* currentX = nullptr;
-			std::vector<float>* swap = nullptr;
-			size_t currentMaxLocalIterations = 0;
-			size_t localIterations = 0;
-			size_t globalIterations = 0;
-			bool converged = true;
-
-			// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
-			// solved after all SCCs it depends on have been solved.
-			int counter = 0;
-
-			for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
-				bool const useGpu = sccIndexIt->first;
-				storm::storage::StateBlock const& scc = sccIndexIt->second;
-
-				// Generate a sub matrix
-				storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
-				storm::storage::SparseMatrix<float> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
-				std::vector<float> sccSubB(sccSubmatrix.getRowCount());
-				storm::utility::vector::selectVectorValues<float>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
-				std::vector<float> sccSubX(sccSubmatrix.getColumnCount());
-				std::vector<float> sccSubXSwap(sccSubmatrix.getColumnCount());
-				std::vector<float> sccMultiplyResult(sccSubmatrix.getRowCount());
-
-				// Prepare the pointers for swapping in the calculation
-				currentX = &sccSubX;
-				swap = &sccSubXSwap;
-
-				storm::utility::vector::selectVectorValues<float>(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states)
-				std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1);
-				sccSubNondeterministicChoiceIndices.at(0) = 0;
-
-				// Pre-process all dependent states
-				// Remove outgoing transitions and create the ChoiceIndices
-				uint_fast64_t innerIndex = 0;
-				uint_fast64_t outerIndex = 0;
-				for (uint_fast64_t state : scc) {
-					// Choice Indices
-					sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]);
-
-					for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) {
-						storm::storage::SparseMatrix<float>::const_rows row = A.getRow(rowGroupIt);
-						for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) {
-							if (!subMatrixIndices.get(rowIt->getColumn())) {
-								// This is an outgoing transition of a state in the SCC to a state not included in the SCC
-								// Subtracting Pr(tau) * x_other from b fixes that
-								sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn()));
-							}
-						}
-						++innerIndex;
-					}
-					++outerIndex;
+				for (size_t i = 0, size = new_x.size(); i < size; ++i) {
+					x.at(i) = new_x.at(i);
 				}
+				return;
+			}
+			
+			// For testing only
+			if (sizeof(ValueType) == sizeof(double)) {
+				std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl;
+				LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>");
+			} else {
+				std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl;
+				LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>");
+			}
 
-				// For the current SCC, we need to perform value iteration until convergence.
-				if (useGpu) {
+			// Now, we need to determine the SCCs of the MDP and perform a topological sort.
+			std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
+			storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> const pseudoModel(A, nondeterministicChoiceIndices);
+			
+			// Check if the decomposition is necessary
 #ifdef STORM_HAVE_CUDAFORSTORM
-					if (!resetCudaDevice()) {
-						LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
-						throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
-					}
-
-					//LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
-					//LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
-					//LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
-
-					bool result = false;
-					localIterations = 0;
-					if (minimize) {
-						result = basicValueIteration_mvReduce_uint64_float_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
-					} else {
-						result = basicValueIteration_mvReduce_uint64_float_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
-					}
-					LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
-
-					if (!result) {
-						converged = false;
-						LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
-						throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
-					} else {
-						converged = true;
-					}
-
-					// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
-					// track of the maximum.
-					if (localIterations > currentMaxLocalIterations) {
-						currentMaxLocalIterations = localIterations;
-					}
-
+#define __USE_CUDAFORSTORM_OPT true
+			size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(A.getRowCount()), nondeterministicChoiceIndices.size(), static_cast<size_t>(A.getEntryCount()));
+			size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
 #else
-					LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
-					throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
+#define __USE_CUDAFORSTORM_OPT false
+			size_t const gpuSizeOfCompleteSystem = 0;
+			size_t const cudaFreeMemory = 0;
 #endif
-				} else {
-					localIterations = 0;
-					converged = false;
-					while (!converged && localIterations < this->maximalNumberOfIterations) {
-						// Compute x' = A*x + b.
-						sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
-						storm::utility::vector::addVectorsInPlace<float>(sccMultiplyResult, sccSubB);
-
-						//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
-						//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
+			std::vector<std::pair<bool, storm::storage::StateBlock>> sccDecomposition;
+			if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) {
+				// Dummy output for SCC Times
+				std::cout << "Computing the SCC Decomposition took 0ms" << std::endl;
 
-						/*
-						Versus:
-						A.multiplyWithVector(*currentX, *multiplyResult);
-						storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
-						*/
-
-						// Reduce the vector x' by applying min/max for all non-deterministic choices.
-						if (minimize) {
-							storm::utility::vector::reduceVectorMin<float>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
-						} else {
-							storm::utility::vector::reduceVectorMax<float>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
-						}
-
-						// Determine whether the method converged.
-						// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
-						// running time. In fact, it is faster. This has to be investigated.
-						// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
-						converged = storm::utility::vector::equalModuloPrecision<float>(*currentX, *swap, this->precision, this->relative);
-
-						// Update environment variables.
-						std::swap(currentX, swap);
-
-						++localIterations;
-						++globalIterations;
-					}
-					LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
+#ifdef STORM_HAVE_CUDAFORSTORM
+				if (!resetCudaDevice()) {
+					LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
+					throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
 				}
 
-
-				// The Result of this SCC has to be taken back into the main result vector
-				innerIndex = 0;
-				for (uint_fast64_t state : scc) {
-					x.at(state) = currentX->at(innerIndex);
-					++innerIndex;
+				std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now();
+				bool result = false;
+				size_t globalIterations = 0;
+				if (minimize) {
+					result = __basicValueIteration_mvReduce_uint64_minimize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations);
+				} else {
+					result = __basicValueIteration_mvReduce_uint64_maximize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations);
 				}
+				LOG4CPLUS_INFO(logger, "Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
 
-				// Since the pointers for swapping in the calculation point to temps they should not be valid anymore
-				currentX = nullptr;
-				swap = nullptr;
-
-				// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
-				// track of the maximum.
-				if (localIterations > currentMaxLocalIterations) {
-					currentMaxLocalIterations = localIterations;
+				bool converged = false;
+				if (!result) {
+					converged = false;
+					LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
+					throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
+				} else {
+					converged = true;
 				}
-			}
-
-			// Check if the solver converged and issue a warning otherwise.
-			if (converged) {
-				LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
-			} else {
-				LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
-			}
-		}
-        
-        template<typename ValueType>
-		void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
-			
-#ifndef GPU_USE_DOUBLE
-			TopologicalValueIterationNondeterministicLinearEquationSolver<float> tvindles(precision, maximalNumberOfIterations, relative);
 
-			storm::storage::SparseMatrix<float> new_A = A.toValueType<float>();
-			std::vector<float> new_x = storm::utility::vector::toValueType<float>(x);
-			std::vector<float> const new_b = storm::utility::vector::toValueType<float>(b);
+				std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now();
+				std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast<std::chrono::milliseconds>(calcEndTime - calcStartTime).count() << "ms." << std::endl;
 
-			tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr);
+				std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << globalIterations << " iterations in a single block." << std::endl;
 
-			for (size_t i = 0, size = new_x.size(); i < size; ++i) {
-				x.at(i) = new_x.at(i);
-			}
+				// Check if the solver converged and issue a warning otherwise.
+				if (converged) {
+					LOG4CPLUS_INFO(logger, "Iterative solver converged after " << globalIterations << " iterations.");
+				} else {
+					LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << globalIterations << " iterations.");
+				}
 #else
-			// For testing only
-			std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl;
-			LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>");
-
-			// Now, we need to determine the SCCs of the MDP and perform a topological sort.
-			std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
-			storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> pseudoModel(A, nondeterministicChoiceIndices);
-			storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(pseudoModel, false, false);
+				LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
+				throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
+#endif
+			} else {
+				std::chrono::high_resolution_clock::time_point sccStartTime = std::chrono::high_resolution_clock::now();
+				storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(pseudoModel, false, false);
 
-			if (sccDecomposition.size() == 0) {
-				LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs.");
-				throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs.";
-			}
+				if (sccDecomposition.size() == 0) {
+					LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs.");
+					throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs.";
+				}
 
-			storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
-			std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
-
-			// Calculate the optimal distribution of sccs
-			std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A);
-			LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs.");
-
-			std::vector<ValueType>* currentX = nullptr;
-			std::vector<ValueType>* swap = nullptr;
-			size_t currentMaxLocalIterations = 0;
-			size_t localIterations = 0;
-			size_t globalIterations = 0;
-			bool converged = true;
-
-			// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
-			// solved after all SCCs it depends on have been solved.
-			int counter = 0;
-
-			for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
-				bool const useGpu = sccIndexIt->first;
-				storm::storage::StateBlock const& scc = sccIndexIt->second;
-
-				// Generate a sub matrix
-				storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
-				storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
-				std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount());
-				storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
-				std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount());
-				std::vector<ValueType> sccSubXSwap(sccSubmatrix.getColumnCount());
-                std::vector<ValueType> sccMultiplyResult(sccSubmatrix.getRowCount());
-                
-				// Prepare the pointers for swapping in the calculation
-				currentX = &sccSubX;
-				swap = &sccSubXSwap;
-
-				storm::utility::vector::selectVectorValues<ValueType>(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states)
-				std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1);
-				sccSubNondeterministicChoiceIndices.at(0) = 0;
-
-				// Pre-process all dependent states
-				// Remove outgoing transitions and create the ChoiceIndices
-				uint_fast64_t innerIndex = 0;
-                uint_fast64_t outerIndex = 0;
-				for (uint_fast64_t state: scc) {
-					// Choice Indices
-					sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]);
-
-					for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) {
-						typename storm::storage::SparseMatrix<ValueType>::const_rows row = A.getRow(rowGroupIt);
-						for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) {
-							if (!subMatrixIndices.get(rowIt->getColumn())) {
-								// This is an outgoing transition of a state in the SCC to a state not included in the SCC
-								// Subtracting Pr(tau) * x_other from b fixes that
-								sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn()));
+				storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
+				std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
+
+				// Calculate the optimal distribution of sccs
+				std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A);
+				LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs.");
+
+				std::chrono::high_resolution_clock::time_point sccEndTime = std::chrono::high_resolution_clock::now();
+				std::cout << "Computing the SCC Decomposition took " << std::chrono::duration_cast<std::chrono::milliseconds>(sccEndTime - sccStartTime).count() << "ms." << std::endl;
+
+				std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now();
+
+				std::vector<ValueType>* currentX = nullptr;
+				std::vector<ValueType>* swap = nullptr;
+				size_t currentMaxLocalIterations = 0;
+				size_t localIterations = 0;
+				size_t globalIterations = 0;
+				bool converged = true;
+
+				// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
+				// solved after all SCCs it depends on have been solved.
+				int counter = 0;
+
+				for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
+					bool const useGpu = sccIndexIt->first;
+					storm::storage::StateBlock const& scc = sccIndexIt->second;
+
+					// Generate a sub matrix
+					storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
+					storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
+					std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount());
+					storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
+					std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount());
+					std::vector<ValueType> sccSubXSwap(sccSubmatrix.getColumnCount());
+					std::vector<ValueType> sccMultiplyResult(sccSubmatrix.getRowCount());
+
+					// Prepare the pointers for swapping in the calculation
+					currentX = &sccSubX;
+					swap = &sccSubXSwap;
+
+					storm::utility::vector::selectVectorValues<ValueType>(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states)
+					std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1);
+					sccSubNondeterministicChoiceIndices.at(0) = 0;
+
+					// Pre-process all dependent states
+					// Remove outgoing transitions and create the ChoiceIndices
+					uint_fast64_t innerIndex = 0;
+					uint_fast64_t outerIndex = 0;
+					for (uint_fast64_t state : scc) {
+						// Choice Indices
+						sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]);
+
+						for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) {
+							typename storm::storage::SparseMatrix<ValueType>::const_rows row = A.getRow(rowGroupIt);
+							for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) {
+								if (!subMatrixIndices.get(rowIt->getColumn())) {
+									// This is an outgoing transition of a state in the SCC to a state not included in the SCC
+									// Subtracting Pr(tau) * x_other from b fixes that
+									sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn()));
+								}
 							}
+							++innerIndex;
 						}
-                        ++innerIndex;
+						++outerIndex;
 					}
-                    ++outerIndex;
-				}
 
-				// For the current SCC, we need to perform value iteration until convergence.
-				if (useGpu) {
+					// For the current SCC, we need to perform value iteration until convergence.
+					if (useGpu) {
 #ifdef STORM_HAVE_CUDAFORSTORM
-					if (!resetCudaDevice()) {
-						LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
-						throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
-					}
+						if (!resetCudaDevice()) {
+							LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
+							throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
+						}
 
-					//LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
-					//LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
-					//LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
+						//LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
+						//LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
+						//LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
 
-					bool result = false;
-					localIterations = 0;
-					if (minimize) {
-						result = basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
-					} else {
-						result = basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
-					}
-					LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
-
-					if (!result) {
-						converged = false;
-						LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
-						throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
-					} else {
-						converged = true;
-					}
+						bool result = false;
+						localIterations = 0;
+						if (minimize) {
+							result = __basicValueIteration_mvReduce_uint64_minimize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
+						} else {
+							result = __basicValueIteration_mvReduce_uint64_maximize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
+						}
+						LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
 
-					// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
-					// track of the maximum.
-					if (localIterations > currentMaxLocalIterations) {
-						currentMaxLocalIterations = localIterations;
-					}
+						if (!result) {
+							converged = false;
+							LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
+							throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
+						} else {
+							converged = true;
+						}
 
+						// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
+						// track of the maximum.
+						if (localIterations > currentMaxLocalIterations) {
+							currentMaxLocalIterations = localIterations;
+						}
+						globalIterations += localIterations;
 #else
-					LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
-					throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
+						LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
+						throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
 #endif
-				} else {
-					localIterations = 0;
-					converged = false;
-					while (!converged && localIterations < this->maximalNumberOfIterations) {
-						// Compute x' = A*x + b.
-						sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
-						storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
+					} else {
+						std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl;
+						localIterations = 0;
+						converged = false;
+						while (!converged && localIterations < this->maximalNumberOfIterations) {
+							// Compute x' = A*x + b.
+							sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
+							storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
+
+							//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
+							//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
+
+							/*
+							Versus:
+							A.multiplyWithVector(*currentX, *multiplyResult);
+							storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
+							*/
+
+							// Reduce the vector x' by applying min/max for all non-deterministic choices.
+							if (minimize) {
+								storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+							} else {
+								storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+							}
 
-						//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
-						//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
+							// Determine whether the method converged.
+							// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
+							// running time. In fact, it is faster. This has to be investigated.
+							// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
+							converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
 
-						/*
-						Versus:
-						A.multiplyWithVector(*currentX, *multiplyResult);
-						storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
-						*/
+							// Update environment variables.
+							std::swap(currentX, swap);
 
-						// Reduce the vector x' by applying min/max for all non-deterministic choices.
-						if (minimize) {
-							storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
+							++localIterations;
+							++globalIterations;
 						}
-						else {
-							storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
-						}
-
-						// Determine whether the method converged.
-						// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
-						// running time. In fact, it is faster. This has to be investigated.
-						// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
-						converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
+						LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
+					}
 
-						// Update environment variables.
-						std::swap(currentX, swap);
 
-						++localIterations;
-						++globalIterations;
+					// The Result of this SCC has to be taken back into the main result vector
+					innerIndex = 0;
+					for (uint_fast64_t state : scc) {
+						x.at(state) = currentX->at(innerIndex);
+						++innerIndex;
 					}
-					LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
-				}
 
+					// Since the pointers for swapping in the calculation point to temps they should not be valid anymore
+					currentX = nullptr;
+					swap = nullptr;
 
-				// The Result of this SCC has to be taken back into the main result vector
-				innerIndex = 0;
-				for (uint_fast64_t state: scc) {
-					x.at(state) = currentX->at(innerIndex);
-					++innerIndex;
+					// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
+					// track of the maximum.
+					if (localIterations > currentMaxLocalIterations) {
+						currentMaxLocalIterations = localIterations;
+					}
 				}
 
-				// Since the pointers for swapping in the calculation point to temps they should not be valid anymore
-				currentX = nullptr;
-				swap = nullptr;
+				std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << localIterations << " iterations in a single block." << std::endl;
 
-				// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
-				// track of the maximum.
-				if (localIterations > currentMaxLocalIterations) {
-					currentMaxLocalIterations = localIterations;
+				// Check if the solver converged and issue a warning otherwise.
+				if (converged) {
+					LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
+				} else {
+					LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
 				}
-			}
 
-			// Check if the solver converged and issue a warning otherwise.
-			if (converged) {
-				LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
-			}
-			else {
-				LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
+				std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now();
+				std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast<std::chrono::milliseconds>(calcEndTime - calcStartTime).count() << "ms." << std::endl;
 			}
-#endif
         }
 
 		template<typename ValueType>
diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
index 40d9df354..bdbf788cd 100644
--- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
@@ -8,6 +8,11 @@
 #include <utility>
 #include <vector>
 
+#include "storm-config.h"
+#ifdef STORM_HAVE_CUDAFORSTORM
+#	include "cudaForStorm.h"
+#endif
+
 namespace storm {
     namespace solver {
         
@@ -42,6 +47,50 @@ namespace storm {
 			 */
 			std::vector<std::pair<bool, storm::storage::StateBlock>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const;
         };
+
+		template <typename ValueType>
+		bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+			//
+			throw;
+		}
+		template <>
+		inline bool __basicValueIteration_mvReduce_uint64_minimize<double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+#ifdef STORM_HAVE_CUDAFORSTORM
+			return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+#else
+			throw;
+#endif
+		}
+		template <>
+		inline bool __basicValueIteration_mvReduce_uint64_minimize<float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+#ifdef STORM_HAVE_CUDAFORSTORM
+			return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+#else
+			throw;
+#endif
+		}
+
+		template <typename ValueType>
+		bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+			//
+			throw;
+		}
+		template <>
+		inline bool __basicValueIteration_mvReduce_uint64_maximize<double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+#ifdef STORM_HAVE_CUDAFORSTORM
+			return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+#else
+			throw;
+#endif
+		}
+		template <>
+		inline bool __basicValueIteration_mvReduce_uint64_maximize<float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+#ifdef STORM_HAVE_CUDAFORSTORM
+			return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+#else
+			throw;
+#endif
+		}
     } // namespace solver
 } // namespace storm