Refactored the CUDA Kernel to once again use the "hacked" combination of column indices and values with a bit of reinterpret_cast magic.

Refactored the CUDA-SCC grouping algorithm as is took 80x longer to calculate the groups than it took to calculate the entire solution. Former-commit-id: 5a5ffabe38
12 years ago · 26500ff4a8
3 changed files with 104 additions and 117 deletions
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
@ -58,13 +58,12 @@ void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVect

 template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
 void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) {
-	std::vector<IndexType> matrixColumnIndices;
-	std::vector<ValueType> matrixValues;
-	exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
+	//std::vector<IndexType> matrixColumnIndices;
+	//std::vector<ValueType> matrixValues;
+	//exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);

 	IndexType* device_matrixRowIndices = nullptr;
-	IndexType* device_matrixColIndices = nullptr;
-	ValueType* device_matrixValues = nullptr;
+	IndexType* device_matrixColIndicesAndValues = nullptr;
 	ValueType* device_x = nullptr;
 	ValueType* device_xSwap = nullptr;
 	ValueType* device_b = nullptr;
@ -96,16 +95,9 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
 	}

 	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixValues), sizeof(ValueType) * matrixNnzCount);
+	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
 	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl;
+		std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}

@ -159,16 +151,9 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
 	}

 	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
+	cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl;
+		std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}

@ -216,7 +201,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
 	// Data is on device, start Kernel
 	while (!converged && iterationCount < maxIterationCount)
 	{ // In a sub-area since transfer of control via label evades initialization
-		cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
+		cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
 		CUDA_CHECK_ALL_ERRORS();

 		thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
@ -272,19 +257,12 @@ cleanup:
 		}
 		device_matrixRowIndices = nullptr;
 	}
-	if (device_matrixColIndices != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices);
+	if (device_matrixColIndicesAndValues != nullptr) {
+		cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
 		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl;
+			std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
 		}
-		device_matrixColIndices = nullptr;
-	}
-	if (device_matrixValues != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_matrixValues);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_matrixValues = nullptr;
+		device_matrixColIndicesAndValues = nullptr;
 	}
 	if (device_x != nullptr) {
 		cudaError_t cudaFreeResult = cudaFree(device_x);
@ -328,13 +306,8 @@ cleanup:

 template <typename IndexType, typename ValueType>
 void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) {
-	std::vector<IndexType> matrixColumnIndices;
-	std::vector<ValueType> matrixValues;
-	exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
-
 	IndexType* device_matrixRowIndices = nullptr;
-	IndexType* device_matrixColIndices = nullptr;
-	ValueType* device_matrixValues = nullptr;
+	IndexType* device_matrixColIndicesAndValues = nullptr;
 	ValueType* device_x = nullptr;
 	ValueType* device_multiplyResult = nullptr;

@ -359,16 +332,9 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
 	}

 	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixValues), sizeof(ValueType) * matrixNnzCount);
+	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
 	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl;
+		std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}

@ -401,16 +367,9 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
 	}

 	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
+	cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl;
+		std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}

@ -433,7 +392,7 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
 	std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
 #endif

-	cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
+	cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
 	CUDA_CHECK_ALL_ERRORS();

 #ifdef DEBUG
@ -460,19 +419,12 @@ cleanup:
 		}
 		device_matrixRowIndices = nullptr;
 	}
-	if (device_matrixColIndices != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_matrixColIndices = nullptr;
-	}
-	if (device_matrixValues != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_matrixValues);
+	if (device_matrixColIndicesAndValues != nullptr) {
+		cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
 		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl;
+			std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
 		}
-		device_matrixValues = nullptr;
+		device_matrixColIndicesAndValues = nullptr;
 	}
 	if (device_x != nullptr) {
 		cudaError_t cudaFreeResult = cudaFree(device_x);
--- a/resources/cudaForStorm/srcCuda/cuspExtension.h
+++ b/resources/cudaForStorm/srcCuda/cuspExtension.h
@ -61,7 +61,7 @@ namespace device
 template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
 __launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
 __global__ void
-storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType * x, ValueType * y)
+storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y)
 {
    __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
    __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];
@ -95,17 +95,17 @@ storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType

            // accumulate local sums
            if(jj >= row_start && jj < row_end)
-                sum += matrixValues[jj] * fetch_x<UseCache>(matrixColumnIndices[jj], x);
+                sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);

            // accumulate local sums
            for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
-                sum += matrixValues[jj] * fetch_x<UseCache>(matrixColumnIndices[jj], x);
+                sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
        }
        else
        {
            // accumulate local sums
            for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
-                sum += matrixValues[jj] * fetch_x<UseCache>(matrixColumnIndices[jj], x);
+                sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
        }

        // store local sum in shared memory
@ -244,7 +244,7 @@ void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_
 }

 template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
-void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
 {
    const size_t THREADS_PER_BLOCK  = 128;
    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
@ -256,36 +256,36 @@ void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType
        bind_x(x);

    storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> 
-        (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+        (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);

    if (UseCache)
        unbind_x(x);
 }

 template <typename IndexType, typename ValueType>
-void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
 {
    const IndexType nnz_per_row = num_entries / num_rows;

-    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector<false, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector<false, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector<false, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<false,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector<false, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector<false, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector<false, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<false,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    
-    __storm_cuda_opt_spmv_csr_vector<false,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+    __storm_cuda_opt_spmv_csr_vector<false,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
 }

 template <typename IndexType, typename ValueType>
-void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
 {
    const IndexType nnz_per_row = num_entries / num_rows;

-    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector<true, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector<true, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector<true, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<true,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector<true, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector<true, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector<true, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<true,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }

-    __storm_cuda_opt_spmv_csr_vector<true,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+    __storm_cuda_opt_spmv_csr_vector<true,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
 }

 // NON-OPT
--- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
@ -1,7 +1,6 @@
 #include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h"

 #include <utility>
-#include <set>

 #include "src/settings/Settings.h"
 #include "src/utility/vector.h"
@ -46,7 +45,6 @@ namespace storm {
        
        template<typename ValueType>
 		void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
-            
 			// Now, we need to determine the SCCs of the MDP and perform a topological sort.
 			std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
 			storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> pseudoModel(A, nondeterministicChoiceIndices);
@ -219,57 +217,94 @@ namespace storm {
 				size_t lastResultIndex = 0;

 				std::vector<uint_fast64_t> const& rowGroupIndices = matrix.getRowGroupIndices();
+
+				size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(matrix.getRowCount()), rowGroupIndices.size(), static_cast<size_t>(matrix.getEntryCount()));
+				size_t const gpuSizePerRowGroup = std::max(static_cast<size_t>(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast<size_t>(1));
+				size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup;
+
 				size_t currentSize = 0;
-				for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
-					storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
+				size_t neededReserveSize = 0;
+				size_t startIndex = 0;
+				for (size_t i = 0; i < topologicalSort.size(); ++i) {
+					storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]];
+					size_t const currentSccSize = scc.size();

 					uint_fast64_t rowCount = 0;
 					uint_fast64_t entryCount = 0;
-					storm::storage::StateBlock rowGroups;
-					rowGroups.reserve(scc.size());

 					for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
 						rowCount += matrix.getRowGroupSize(*sccIt);
 						entryCount += matrix.getRowGroupEntryCount(*sccIt);
-						rowGroups.insert(*sccIt);
 					}

 					size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(rowCount), scc.size(), static_cast<size_t>(entryCount));

 					if ((currentSize + sccSize) <= cudaFreeMemory) {
 						// There is enough space left in the current group
+						neededReserveSize += currentSccSize;
+						currentSize += sccSize;
+					} else {
+						// This would make the last open group to big for the GPU

-						if (currentSize == 0) {
-							result.push_back(std::make_pair(true, rowGroups));
+						if (startIndex < i) {
+							if ((startIndex + 1) < i) {
+								// More than one component
+								std::vector<uint_fast64_t> tempGroups;
+								tempGroups.reserve(neededReserveSize);
+								for (size_t j = startIndex; j < i; ++j) {
+									storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
+									tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
 								}
-						else {
-							result[lastResultIndex].second.insert(rowGroups.begin(), rowGroups.end());
+								std::sort(tempGroups.begin(), tempGroups.end());
+
+								result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend())));
+							} else {
+								// Only one group, copy construct.
+								result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
 							}
-						currentSize += sccSize;
+							++lastResultIndex;
 						}
-					else {
+
 						if (sccSize <= cudaFreeMemory) {
-							++lastResultIndex;
-							result.push_back(std::make_pair(true, rowGroups));
 							currentSize = sccSize;
-						}
-						else {
+							neededReserveSize = currentSccSize;
+							startIndex = i;
+						} else {
 							// This group is too big to fit into the CUDA Memory by itself
-							lastResultIndex += 2;
-							result.push_back(std::make_pair(false, rowGroups));
+							result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]]))));
+							++lastResultIndex;
+
 							currentSize = 0;
+							neededReserveSize = 0;
+							startIndex = i + 1;
+						}
 					}
 				}
+
+				size_t const topologicalSortSize = topologicalSort.size();
+				if (startIndex < topologicalSortSize) {
+					if ((startIndex + 1) < topologicalSortSize) {
+						// More than one component
+						std::vector<uint_fast64_t> tempGroups;
+						tempGroups.reserve(neededReserveSize);
+						for (size_t j = startIndex; j < topologicalSortSize; ++j) {
+							storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
+							tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
+						}
+						std::sort(tempGroups.begin(), tempGroups.end());
+
+						result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend())));
+					}
+					else {
+						// Only one group, copy construct.
+						result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
+					}
+					++lastResultIndex;
 				}
 #else
 				for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
 					storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
-					storm::storage::StateBlock rowGroups;
-					rowGroups.reserve(scc.size());
-					for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
-						rowGroups.insert(*sccIt);
-					}
-					result.push_back(std::make_pair(false, rowGroups));
+					result.push_back(std::make_pair(false, scc));
 				}
 #endif
 			return result;