
Implemented a basic but complete kernel for value iteration in CUDA.

It doesn't work :(


Former-commit-id: 6a3a7aa505
PBerger committed 11 years ago (branch: tempestpy_adaptions), commit c0a7e42486
  1. 168  resources/cudaForStorm/srcCuda/basicValueIteration.cu
  2. 8    resources/cudaForStorm/srcCuda/basicValueIteration.h
  3. 3    resources/cudaForStorm/srcCuda/cudaForStorm.h
  4. 257  resources/cudaForStorm/srcCuda/cuspExtension.h
  5. 18   resources/cudaForStorm/srcCuda/utility.cu
  6. 15   resources/cudaForStorm/srcCuda/utility.h
  7. 24   src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
  8. 6    src/storage/SparseMatrix.h
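For orientation, the commit's new exported entry points (declared in basicValueIteration.h below) take the CSR row offsets, the interleaved (column, value) pairs, the value vector x, the offset vector b, and the row-group boundaries of the nondeterministic choices. A minimal host-side sketch of a call, with toy data whose shapes follow my reading of the allocation code in basicValueIteration.cu (x has one entry per row group, b one entry per row), could look like this:

#include <cstdint>
#include <utility>
#include <vector>

#include "basicValueIteration.h"  // exported entry points added by this commit

int main() {
    // Toy shapes only, chosen to match the allocation sizes in basicValueIteration.cu:
    // rowCount = rowIndices.size() - 1, state count = choiceIndices.size() - 1.
    std::vector<uint_fast64_t> rowIndices = {0, 1, 2};                   // CSR row offsets (2 rows)
    std::vector<std::pair<uint_fast64_t, double>> columnsAndValues =     // (column, probability) per non-zero
        { {0, 1.0}, {0, 1.0} };
    std::vector<double> x = {0.0};                                       // one value per state (row group)
    std::vector<double> b = {0.0, 0.5};                                  // one offset per row
    std::vector<uint_fast64_t> choiceIndices = {0, 2};                   // row-group boundaries per state

    // Minimize over the choices of each state, absolute precision check, at most 10000 iterations.
    basicValueIteration_mvReduce_uint64_double_minimize(
        10000, 1e-6, false, rowIndices, columnsAndValues, x, b, choiceIndices);
    return 0;  // x now holds the approximated state values
}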

168  resources/cudaForStorm/srcCuda/basicValueIteration.cu

@@ -1,4 +1,5 @@
#include "basicValueIteration.h"
#define CUSP_USE_TEXTURE_MEMORY
#include <iostream>
#include <chrono>
@@ -6,54 +7,111 @@
#include <cuda_runtime.h>
#include "cusparse_v2.h"
#include "utility.h"
#include "cuspExtension.h"
#include <thrust/transform.h>
#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#define CUDA_CHECK_ALL_ERRORS() do { \
cudaError_t errSync = cudaGetLastError(); \
cudaError_t errAsync = cudaDeviceSynchronize(); \
if (errSync != cudaSuccess) { \
std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ")" << std::endl; \
} \
if (errAsync != cudaSuccess) { \
std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \
} } while(false)
__global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) {
*B = *A;
}
template <typename IndexType, typename ValueType>
void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) {
template<typename T, bool Relative>
struct equalModuloPrecision : public thrust::binary_function<T,T,T>
{
__host__ __device__ T operator()(const T &x, const T &y) const
{
if (Relative) {
const T result = (x - y) / y;
return (result > 0) ? result : -result;
} else {
const T result = (x - y);
return (result > 0) ? result : -result;
}
}
};
template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) {
IndexType* device_matrixRowIndices = nullptr;
IndexType* device_matrixColIndicesAndValues = nullptr;
ValueType* device_x = nullptr;
ValueType* device_xSwap = nullptr;
ValueType* device_b = nullptr;
ValueType* device_multiplyResult = nullptr;
IndexType* device_nondeterministicChoiceIndices = nullptr;
std::cout.sync_with_stdio(true);
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
const IndexType matrixRowCount = matrixRowIndices.size() - 1;
const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1;
const IndexType matrixNnzCount = columnIndicesAndValues.size();
cudaError_t cudaMallocResult;
cudaMallocResult = cudaMalloc<IndexType>(&device_matrixRowIndices, matrixRowIndices.size());
bool converged = false;
uint_fast64_t iterationCount = 0;
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<IndexType>(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2);
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<ValueType>(&device_x, x.size());
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<ValueType>(&device_b, b.size());
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_xSwap), sizeof(ValueType) * matrixColCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * matrixRowCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<ValueType>(&device_multiplyResult, b.size());
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<IndexType>(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size());
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixRowCount + 1));
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
@@ -62,38 +120,99 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve
// Memory allocated, copy data to device
cudaError_t cudaCopyResult;
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * matrixRowIndices.size(), cudaMemcpyHostToDevice);
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * columnIndicesAndValues.size()) + (sizeof(ValueType) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice);
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * x.size(), cudaMemcpyHostToDevice);
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * b.size(), cudaMemcpyHostToDevice);
// Preset the xSwap to zeros...
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice);
// Preset the multiplyResult to zeros...
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
// Data is on device, start Kernel
while (!converged && iterationCount < maxIterationCount)
{ // In a sub-area since transfer of control via label evades initialization
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS();
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult);
// Transform: Add multiplyResult + b inplace to multiplyResult
thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>());
CUDA_CHECK_ALL_ERRORS();
// Reduce: Reduce multiplyResult to a new x vector
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, IndexType, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS();
// Check for convergence
// Transform: x = abs(x - xSwap)/ xSwap
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount);
thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap);
thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
CUDA_CHECK_ALL_ERRORS();
// Reduce: get Max over x and check for res < Precision
ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, 0, thrust::maximum<ValueType>());
CUDA_CHECK_ALL_ERRORS();
converged = maxX < precision;
++iterationCount;
// Swap pointers, device_x always contains the most current result
std::swap(device_x, device_xSwap);
}
std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
// Get x back from the device
cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
// All code related to freeing memory and clearing up the device
cleanup:
@@ -118,6 +237,13 @@ cleanup:
}
device_x = nullptr;
}
if (device_xSwap != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_xSwap);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl;
}
device_xSwap = nullptr;
}
if (device_b != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_b);
if (cudaFreeResult != cudaSuccess) {
@@ -150,6 +276,18 @@ void cudaForStormTestFunction(int a, int b) {
std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl;
}
void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
basicValueIteration_mvReduce<uint_fast64_t, double>(maxIterationCount, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
if (relativePrecisionCheck) {
basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
} else {
basicValueIteration_mvReduce<true, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
}
}
void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
if (relativePrecisionCheck) {
basicValueIteration_mvReduce<false, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
} else {
basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
}
}
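The convergence test inside the iteration loop above pairs the equalModuloPrecision functor with a max-reduce over the componentwise differences. A small host-only sketch of the same pattern (same functor shape, thrust::host_vector instead of raw device pointers, and an explicitly double-typed initial value for the reduce) is:

#include <iostream>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>

// Same shape as the functor in basicValueIteration.cu: componentwise |x - y| or |x - y| / y.
template<typename T, bool Relative>
struct equalModuloPrecision : public thrust::binary_function<T, T, T> {
    __host__ __device__ T operator()(const T& x, const T& y) const {
        const T diff = Relative ? (x - y) / y : (x - y);
        return (diff > 0) ? diff : -diff;
    }
};

int main() {
    thrust::host_vector<double> xOld(3), xNew(3), diff(3);
    xOld[0] = 1.00; xOld[1] = 2.00; xOld[2] = 3.00;
    xNew[0] = 1.01; xNew[1] = 2.00; xNew[2] = 2.98;

    // diff[i] = |xOld[i] - xNew[i]| / xNew[i]   (relative criterion)
    thrust::transform(xOld.begin(), xOld.end(), xNew.begin(), diff.begin(),
                      equalModuloPrecision<double, true>());
    // converged  <=>  max(diff) < precision
    double maxDiff = thrust::reduce(diff.begin(), diff.end(), 0.0, thrust::maximum<double>());
    std::cout << "max relative diff: " << maxDiff << std::endl;
    return 0;
}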

8  resources/cudaForStorm/srcCuda/basicValueIteration.h

@@ -1,3 +1,6 @@
#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
#include <cstdint>
#include <vector>
#include <utility>
@@ -6,4 +9,7 @@
#include "cudaForStorm_Export.h"
cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_

3  resources/cudaForStorm/srcCuda/cudaForStorm.h

@@ -8,6 +8,9 @@
// TopologicalValueIteration
#include "srcCuda/basicValueIteration.h"
// Utility Functions
#include "srcCuda/utility.h"

257  resources/cudaForStorm/srcCuda/cuspExtension.h

@@ -25,6 +25,8 @@
#pragma once
#include <cusp/detail/device/spmv/csr_vector.h>
#include <limits>
#include <algorithm>
namespace cusp
{
@@ -33,6 +35,261 @@ namespace detail
namespace device
{
//////////////////////////////////////////////////////////////////////////////
// CSR SpMV kernels based on a vector model (one warp per row)
//////////////////////////////////////////////////////////////////////////////
//
// spmv_csr_vector_device
// Each row of the CSR matrix is assigned to a warp. The warp computes
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with
// the x vector, in parallel. This division of work implies that
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
// manner (but generally not aligned). On GT200 these accesses are
// coalesced, unlike kernels based on the one-row-per-thread division of
// work. Since an entire 32-thread warp is assigned to each row, many
// threads will remain idle when their row contains a small number
// of elements. This code relies on implicit synchronization among
// threads in a warp.
//
// spmv_csr_vector_tex_device
// Same as spmv_csr_vector_device, except that the texture cache is
// used for accessing the x vector.
//
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32]
template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y)
{
__shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals
__shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];
const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector
const IndexType vector_id = thread_id / THREADS_PER_VECTOR; // global vector index
const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block
const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors
for(IndexType row = vector_id; row < num_rows; row += num_vectors)
{
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the straightforward version
if(thread_lane < 2)
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane];
const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
// initialize local sum
ValueType sum = 0;
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
{
// ensure aligned memory access to Aj and Ax
IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
// accumulate local sums
if(jj >= row_start && jj < row_end)
sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2 * jj], x);
// accumulate local sums
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2 * jj], x);
}
else
{
// accumulate local sums
for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2 * jj], x);
}
// store local sum in shared memory
sdata[threadIdx.x] = sum;
// reduce local sums to row sum
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8];
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4];
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2];
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1];
// first thread writes the result
if (thread_lane == 0)
y[row] = sdata[threadIdx.x];
}
}
template <typename IndexType, typename ValueType, unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1)
__global__ void
storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y, const ValueType minMaxInitializer)
{
__shared__ volatile ValueType sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals
__shared__ volatile IndexType ptrs[ROWS_PER_BLOCK][2];
const IndexType THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW;
const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
const IndexType thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector
const IndexType vector_id = thread_id / THREADS_PER_ROW; // global vector index
const IndexType vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block
const IndexType num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors
for(IndexType row = vector_id; row < num_rows; row += num_vectors)
{
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the straightforward version
if(thread_lane < 2)
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane];
const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
// initialize local Min/Max
ValueType localMinMaxElement = minMaxInitializer;
if (THREADS_PER_ROW == 32 && row_end - row_start > 32)
{
// ensure aligned memory access to Aj and Ax
IndexType jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane;
// accumulate local sums
if(jj >= row_start && jj < row_end) {
if(Minimize) {
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
// accumulate local sums
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW)
if(Minimize) {
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
else
{
// accumulate local sums
for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW)
if(Minimize) {
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
// store local sum in shared memory
sdata[threadIdx.x] = localMinMaxElement;
// reduce local min/max to row min/max
if (Minimize) {
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);
} else {
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);
}
// first thread writes the result
if (thread_lane == 0)
x[row] = sdata[threadIdx.x];
}
}
template <bool Minimize, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y)
{
ValueType __minMaxInitializer = 0;
if (Minimize) {
__minMaxInitializer = std::numeric_limits<ValueType>::max();
}
const ValueType minMaxInitializer = __minMaxInitializer;
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
storm_cuda_opt_vector_reduce_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer);
}
template <bool Minimize, typename IndexType, typename ValueType>
void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_entries, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y)
{
const IndexType rows_per_group = num_entries / num_rows;
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; }
__storm_cuda_opt_vector_reduce<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y);
}
template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
{
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
if (UseCache)
bind_x(x);
storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
if (UseCache)
unbind_x(x);
}
template <typename IndexType, typename ValueType>
void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
template <typename IndexType, typename ValueType>
void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
// NON-OPT
template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
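The warp-per-row kernel above reads the column index at position 2*jj and the value at 2*jj+1 of the interleaved array; on the host side the same data lives in a std::vector of (column, value) pairs. As a hedged baseline for checking the CUDA result, a plain CPU reference of the per-row dot products (referenceSpmv is a hypothetical helper, not part of this commit) might look like this:

#include <cstdint>
#include <utility>
#include <vector>

// Host-side reference SpMV over the CSR layout the kernel consumes:
// rowIndices[r] .. rowIndices[r+1] delimit the (column, value) pairs of row r.
std::vector<double> referenceSpmv(std::vector<uint_fast64_t> const& rowIndices,
                                  std::vector<std::pair<uint_fast64_t, double>> const& columnsAndValues,
                                  std::vector<double> const& x) {
    std::vector<double> y(rowIndices.size() - 1, 0.0);
    for (uint_fast64_t row = 0; row + 1 < rowIndices.size(); ++row) {
        for (uint_fast64_t jj = rowIndices[row]; jj < rowIndices[row + 1]; ++jj) {
            y[row] += columnsAndValues[jj].second * x[columnsAndValues[jj].first];
        }
    }
    return y;
}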

18  resources/cudaForStorm/srcCuda/utility.cu

@@ -1,3 +1,7 @@
#include "utility.h"
#include <cuda_runtime.h>
size_t getFreeCudaMemory() {
size_t freeMemory;
size_t totalMemory;
@@ -14,6 +18,16 @@ size_t getTotalCudaMemory() {
return totalMemory;
}
void resetCudaDevice() {
cudaDeviceReset();
bool resetCudaDevice() {
cudaError_t result = cudaDeviceReset();
return (result == cudaSuccess);
}
int getRuntimeCudaVersion() {
int result = -1;
cudaError_t errorResult = cudaRuntimeGetVersion(&result);
if (errorResult != cudaSuccess) {
return -1;
}
return result;
}

15  resources/cudaForStorm/srcCuda/utility.h

@@ -1,3 +1,12 @@
size_t getFreeCudaMemory();
size_t getTotalCudaMemory();
void resetCudaDevice();
#ifndef STORM_CUDAFORSTORM_UTILITY_H_
#define STORM_CUDAFORSTORM_UTILITY_H_
// Library exports
#include "cudaForStorm_Export.h"
cudaForStorm_EXPORT size_t getFreeCudaMemory();
cudaForStorm_EXPORT size_t getTotalCudaMemory();
cudaForStorm_EXPORT bool resetCudaDevice();
cudaForStorm_EXPORT int getRuntimeCudaVersion();
#endif // STORM_CUDAFORSTORM_UTILITY_H_

24  src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp

@@ -13,6 +13,9 @@
#include "log4cplus/loggingmacros.h"
extern log4cplus::Logger logger;
#include "storm-config.h"
#include "cudaForStorm.h"
namespace storm {
namespace solver {
@@ -80,6 +83,7 @@ namespace storm {
for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) {
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
std::cout << "SCC Index: " << *sccIndexIt << std::endl;
// Generate a submatrix
storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
@@ -121,6 +125,7 @@ namespace storm {
}
// For the current SCC, we need to perform value iteration until convergence.
#ifndef STORM_HAVE_CUDAFORSTORM
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
@@ -157,6 +162,23 @@ namespace storm {
++localIterations;
++globalIterations;
}
std::cout << "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations." << std::endl;
#else
if (!resetCudaDevice()) {
std::cout << "Could not reset CUDA Device!" << std::endl;
}
std::cout << "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
size_t memSize = sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size();
std::cout << "We will allocate " << memSize << " Bytes." << std::endl;
std::cout << "The CUDA Runtime Version is " << getRuntimeCudaVersion() << std::endl;
if (minimize) {
basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices);
}
else {
basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices);
}
#endif
// The Result of this SCC has to be taken back into the main result vector
innerIndex = 0;
@@ -165,7 +187,7 @@ namespace storm {
++innerIndex;
}
// Since the pointers for swapping in the calculation point to temps they should not be valide anymore
// Since the pointers for swapping in the calculation point to temps they should not be valid anymore
currentX = nullptr;
swap = nullptr;

6  src/storage/SparseMatrix.h

@@ -19,6 +19,11 @@ namespace storm {
class EigenAdapter;
class StormAdapter;
}
namespace solver {
template<typename T>
class TopologicalValueIterationNondeterministicLinearEquationSolver;
}
}
namespace storm {
@@ -142,6 +147,7 @@ namespace storm {
friend class storm::adapters::GmmxxAdapter;
friend class storm::adapters::EigenAdapter;
friend class storm::adapters::StormAdapter;
friend class storm::solver::TopologicalValueIterationNondeterministicLinearEquationSolver<T>;
typedef typename std::vector<std::pair<uint_fast64_t, T>>::iterator iterator;
typedef typename std::vector<std::pair<uint_fast64_t, T>>::const_iterator const_iterator;
