Fixed include directories for CUDA Plugin in CMakeLists.txt
Refactored all code related to the SPMV kernels to work with float.
Wrote a test that determines whether the compiler uses 64-bit boundary alignment for std::pairs of uint64 and float.
Introduced functions that allow for conversions between different ValueTypes (e.g. from float to double and back).
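As an illustration of the ValueType conversions mentioned above, here is a minimal sketch; the helper name toValueType and its signature are assumptions for illustration, not the functions actually introduced by this commit.

#include <vector>

// Hypothetical sketch: convert a vector of one value type (e.g. double) into another
// (e.g. float), as is needed when handing data to the float-based CUDA kernels.
template<typename TargetType, typename SourceType>
std::vector<TargetType> toValueType(std::vector<SourceType> const& source) {
	std::vector<TargetType> converted;
	converted.reserve(source.size());
	for (SourceType const& entry : source) {
		converted.push_back(static_cast<TargetType>(entry));
	}
	return converted;
}

// Usage (e.g.): std::vector<float> xFloat = toValueType<float>(xDouble);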
Former-commit-id: 830d24064f
tempestpy_adaptions
PBerger
10 years ago
17 changed files with 1388 additions and 368 deletions
- 2 CMakeLists.txt
- 31 resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
- 18 resources/cudaForStorm/CMakeLists.txt
- 167 resources/cudaForStorm/srcCuda/basicValueIteration.cu
- 12 resources/cudaForStorm/srcCuda/basicValueIteration.h
- 353 resources/cudaForStorm/srcCuda/cuspExtension.h
- 361 resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
- 375 resources/cudaForStorm/srcCuda/cuspExtensionFloat.h
- 4 resources/cudaForStorm/storm-cudaplugin-config.h.in
- 3 src/models/PseudoModel.cpp
- 3 src/solver/NativeNondeterministicLinearEquationSolver.cpp
- 198 src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
- 4 src/storage/SparseMatrix.cpp
- 17 src/storage/SparseMatrix.h
- 1 src/storage/StronglyConnectedComponentDecomposition.cpp
- 14 src/utility/vector.h
- 193 test/functional/solver/CudaPluginTest.cpp
@ -0,0 +1,31 @@
/*
 * This is a component of StoRM - CUDA Plugin to check whether a pair of uint_fast64_t and float gets auto-aligned to match 64-bit boundaries
 */
#include <cstdint>
#include <utility>
#include <vector>

#define CONTAINER_SIZE 100ul

int main(int argc, char* argv[]) {
	int result = 0;

	std::vector<std::pair<uint_fast64_t, float>> myVector;
	for (size_t i = 0; i < CONTAINER_SIZE; ++i) {
		myVector.push_back(std::make_pair(i, 42.12345f * i));
	}

	char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first));
	char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first));
	ptrdiff_t uintDiff = secondUintPointer - firstUintPointer;

	if (uintDiff == (2 * sizeof(uint_fast64_t))) {
		result = 2;
	} else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) {
		result = 3;
	} else {
		result = -5;
	}

	return result;
}
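For orientation, the two non-error exit codes above distinguish the two layouts a compiler may choose for std::pair<uint_fast64_t, float>. Below is a small stand-alone sketch, assuming the usual 8-byte uint_fast64_t and 4-byte float; the connection to the STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT define used in cuspExtensionFloat.h further down is my reading of the build setup, not something stated in the diff itself.

#include <cstdint>
#include <iostream>
#include <utility>

int main() {
	// Two possible outcomes for the pair layout:
	//  - padded to 16 bytes: neighbouring pairs in a std::vector sit
	//    2 * sizeof(uint_fast64_t) apart              -> the check exits with 2;
	//  - packed to 12 bytes: they sit
	//    sizeof(uint_fast64_t) + sizeof(float) apart  -> the check exits with 3.
	std::cout << "sizeof(std::pair<uint_fast64_t, float>) = "
	          << sizeof(std::pair<uint_fast64_t, float>) << std::endl;
	return 0;
}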
@ -1,338 +1,47 @@
/* |
|||
* This is an extension of the original CUSP csr_vector.h SPMV implementation. |
|||
* It is based on the Code and incorporates changes as to cope with the details |
|||
* of the StoRM code. |
|||
* As this is mostly copy & paste, the original license still applies. |
|||
*/ |
|||
|
|||
/* |
|||
* Copyright 2008-2009 NVIDIA Corporation |
|||
* |
|||
* Licensed under the Apache License, Version 2.0 (the "License"); |
|||
* you may not use this file except in compliance with the License. |
|||
* You may obtain a copy of the License at |
|||
* |
|||
* http://www.apache.org/licenses/LICENSE-2.0 |
|||
* |
|||
* Unless required by applicable law or agreed to in writing, software |
|||
* distributed under the License is distributed on an "AS IS" BASIS, |
|||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
* See the License for the specific language governing permissions and |
|||
* limitations under the License. |
|||
*/ |
|||
|
|||
|
|||
#pragma once |
|||
|
|||
#include <cusp/detail/device/spmv/csr_vector.h> |
|||
#include <limits> |
|||
#include <algorithm> |
|||
|
|||
namespace cusp |
|||
{ |
|||
namespace detail |
|||
{ |
|||
namespace device |
|||
{ |
|||
|
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// CSR SpMV kernels based on a vector model (one warp per row) |
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// |
|||
// spmv_csr_vector_device |
|||
// Each row of the CSR matrix is assigned to a warp. The warp computes |
|||
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
|||
// the x vector, in parallel. This division of work implies that |
|||
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
|||
// manner (but generally not aligned). On GT200 these accesses are |
|||
// coalesced, unlike kernels based on the one-row-per-thread division of |
|||
// work. Since an entire 32-thread warp is assigned to each row, many |
|||
// threads will remain idle when their row contains a small number |
|||
// of elements. This code relies on implicit synchronization among |
|||
// threads in a warp. |
|||
// |
|||
// spmv_csr_vector_tex_device |
|||
// Same as spmv_csr_vector_tex_device, except that the texture cache is |
|||
// used for accessing the x vector. |
|||
// |
|||
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
|||
|
|||
|
|||
template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
|||
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
|||
__global__ void |
|||
storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y) |
|||
{ |
|||
__shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2]; |
|||
|
|||
const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
|||
|
|||
const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
|||
const IndexType vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
|||
const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
|||
const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(IndexType row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
|||
|
|||
const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local sum |
|||
ValueType sum = 0; |
|||
|
|||
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) |
|||
sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) |
|||
sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
} |
|||
else |
|||
{ |
|||
// accumulate local sums |
|||
for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) |
|||
sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = sum; |
|||
|
|||
// reduce local sums to row sum |
|||
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
|||
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
|||
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
|||
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
|||
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
y[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <typename IndexType, typename ValueType, unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize> |
|||
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) |
|||
__global__ void |
|||
storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y, const ValueType minMaxInitializer) |
|||
{ |
|||
__shared__ volatile ValueType sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile IndexType ptrs[ROWS_PER_BLOCK][2]; |
|||
|
|||
const IndexType THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; |
|||
|
|||
const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const IndexType thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector |
|||
const IndexType vector_id = thread_id / THREADS_PER_ROW; // global vector index |
|||
const IndexType vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block |
|||
const IndexType num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(IndexType row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; |
|||
|
|||
const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local Min/Max |
|||
ValueType localMinMaxElement = minMaxInitializer; |
|||
|
|||
if (THREADS_PER_ROW == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
IndexType jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
if(Minimize) { |
|||
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
#include "cuspExtensionFloat.h" |
|||
#include "cuspExtensionDouble.h" |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
else |
|||
{ |
|||
// accumulate local sums |
|||
for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
namespace cusp { |
|||
namespace detail { |
|||
namespace device { |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = localMinMaxElement; |
|||
|
|||
// reduce local min/max to row min/max |
|||
if (Minimize) { |
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement); |
|||
} else { |
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement); |
|||
} |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
x[row] = sdata[threadIdx.x]; |
|||
} |
|||
template <typename ValueType> |
|||
void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) { |
|||
// |
|||
throw; |
|||
} |
|||
|
|||
template <bool Minimize, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType> |
|||
void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) |
|||
{ |
|||
ValueType __minMaxInitializer = -std::numeric_limits<ValueType>::max(); |
|||
if (Minimize) { |
|||
__minMaxInitializer = std::numeric_limits<ValueType>::max(); |
|||
} |
|||
const ValueType minMaxInitializer = __minMaxInitializer; |
|||
|
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
storm_cuda_opt_vector_reduce_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); |
|||
} |
|||
|
|||
template <bool Minimize, typename IndexType, typename ValueType> |
|||
void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_entries, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) |
|||
{ |
|||
const IndexType rows_per_group = num_entries / num_rows; |
|||
|
|||
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
|
|||
__storm_cuda_opt_vector_reduce<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y); |
|||
template <> |
|||
void storm_cuda_opt_spmv_csr_vector<double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) { |
|||
storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType> |
|||
void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
template <> |
|||
void storm_cuda_opt_spmv_csr_vector<float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) { |
|||
storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
template <typename IndexType, typename ValueType> |
|||
void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) |
|||
{ |
|||
const IndexType nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<false, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<false, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<false, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<false,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector<false,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
template <bool Minimize, typename ValueType> |
|||
void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) { |
|||
// |
|||
throw; |
|||
} |
|||
|
|||
template <typename IndexType, typename ValueType> |
|||
void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) |
|||
{ |
|||
const IndexType nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<true, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<true, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<true, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<true,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector<true,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<true, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) { |
|||
storm_cuda_opt_vector_reduce_double<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
|
|||
// NON-OPT |
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType> |
|||
void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<false, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) { |
|||
storm_cuda_opt_vector_reduce_double<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
|
|||
template <typename IndexType, typename ValueType> |
|||
void storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) |
|||
{ |
|||
const IndexType nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<true, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) { |
|||
storm_cuda_opt_vector_reduce_float<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
|
|||
template <typename IndexType, typename ValueType> |
|||
void storm_cuda_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) |
|||
{ |
|||
const IndexType nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<false, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) { |
|||
storm_cuda_opt_vector_reduce_float<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
|
|||
} // end namespace device
|
@ -0,0 +1,361 @@
/* |
|||
* This is an extension of the original CUSP csr_vector.h SPMV implementation. |
|||
* It is based on the Code and incorporates changes as to cope with the details |
|||
* of the StoRM code. |
|||
* As this is mostly copy & paste, the original license still applies. |
|||
*/ |
|||
|
|||
/* |
|||
* Copyright 2008-2009 NVIDIA Corporation |
|||
* |
|||
* Licensed under the Apache License, Version 2.0 (the "License"); |
|||
* you may not use this file except in compliance with the License. |
|||
* You may obtain a copy of the License at |
|||
* |
|||
* http://www.apache.org/licenses/LICENSE-2.0 |
|||
* |
|||
* Unless required by applicable law or agreed to in writing, software |
|||
* distributed under the License is distributed on an "AS IS" BASIS, |
|||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
* See the License for the specific language governing permissions and |
|||
* limitations under the License. |
|||
*/ |
|||
|
|||
|
|||
#pragma once |
|||
|
|||
#include <limits> |
|||
#include <cstdint> |
|||
#include <algorithm> |
|||
|
|||
#include <math_functions.h> |
|||
|
|||
#include <cusp/detail/device/spmv/csr_vector.h> |
|||
|
|||
namespace cusp |
|||
{ |
|||
namespace detail |
|||
{ |
|||
namespace device |
|||
{ |
|||
|
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// CSR SpMV kernels based on a vector model (one warp per row) |
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// |
|||
// spmv_csr_vector_device |
|||
// Each row of the CSR matrix is assigned to a warp. The warp computes |
|||
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
|||
// the x vector, in parallel. This division of work implies that |
|||
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
|||
// manner (but generally not aligned). On GT200 these accesses are |
|||
// coalesced, unlike kernels based on the one-row-per-thread division of |
|||
// work. Since an entire 32-thread warp is assigned to each row, many |
|||
// threads will remain idle when their row contains a small number |
|||
// of elements. This code relies on implicit synchronization among |
|||
// threads in a warp. |
|||
// |
|||
// spmv_csr_vector_tex_device |
|||
// Same as spmv_csr_vector_tex_device, except that the texture cache is |
|||
// used for accessing the x vector. |
|||
// |
|||
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
|||
|
|||
|
|||
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
|||
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
|||
__global__ void |
|||
storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double * x, double * y) |
|||
{ |
|||
__shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
|||
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local sum |
|||
double sum = 0; |
|||
|
|||
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
|||
} |
|||
} else { |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = sum; |
|||
|
|||
// reduce local sums to row sum |
|||
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
|||
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
|||
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
|||
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
|||
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
y[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize> |
|||
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) |
|||
__global__ void |
|||
storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y, const double minMaxInitializer) |
|||
{ |
|||
__shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block |
|||
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local Min/Max |
|||
double localMinMaxElement = minMaxInitializer; |
|||
|
|||
if (THREADS_PER_ROW == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
else |
|||
{ |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = localMinMaxElement; |
|||
|
|||
// reduce local min/max to row min/max |
|||
if (Minimize) { |
|||
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
|||
|
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); |
|||
} else { |
|||
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); |
|||
} |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
x[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <bool Minimize, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) |
|||
{ |
|||
double __minMaxInitializer = -std::numeric_limits<double>::max(); |
|||
if (Minimize) { |
|||
__minMaxInitializer = std::numeric_limits<double>::max(); |
|||
} |
|||
const double minMaxInitializer = __minMaxInitializer; |
|||
|
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); |
|||
} |
|||
|
|||
template <bool Minimize> |
|||
void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) |
|||
{ |
|||
const uint_fast64_t rows_per_group = num_entries / num_rows; |
|||
|
|||
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
|
|||
__storm_cuda_opt_vector_reduce_double<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
} |
|||
|
|||
void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
// NON-OPT |
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
} |
|||
|
|||
void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
} |
|||
|
|||
void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
} |
|||
|
|||
} // end namespace device
} // end namespace detail
} // end namespace cusp
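The double kernels above read each nonzero from two interleaved 8-byte slots: the column index bit-pattern at position 2*jj and the value at 2*jj + 1. Here is a host-side sketch of packing such a buffer before copying it to the device; the helper name packColumnsAndValues is hypothetical, and it assumes uint_fast64_t and double are both 8 bytes wide, which the kernel's reinterpret_cast already requires.

#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical helper: pack (columnIndex, value) pairs into the interleaved layout
// expected by storm_cuda_opt_spmv_csr_vector_double. Slot 2*j carries the raw bits
// of the column index, slot 2*j + 1 carries the matrix value.
std::vector<double> packColumnsAndValues(std::vector<uint_fast64_t> const& columns,
                                         std::vector<double> const& values) {
	std::vector<double> interleaved(2 * values.size());
	for (std::size_t j = 0; j < values.size(); ++j) {
		std::memcpy(&interleaved[2 * j], &columns[j], sizeof(uint_fast64_t)); // index as raw bits
		interleaved[2 * j + 1] = values[j];                                   // value
	}
	return interleaved;
}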
@ -0,0 +1,375 @@
/* |
|||
* This is an extension of the original CUSP csr_vector.h SPMV implementation. |
|||
* It is based on the Code and incorporates changes as to cope with the details |
|||
* of the StoRM code. |
|||
* As this is mostly copy & paste, the original license still applies. |
|||
*/ |
|||
|
|||
/* |
|||
* Copyright 2008-2009 NVIDIA Corporation |
|||
* |
|||
* Licensed under the Apache License, Version 2.0 (the "License"); |
|||
* you may not use this file except in compliance with the License. |
|||
* You may obtain a copy of the License at |
|||
* |
|||
* http://www.apache.org/licenses/LICENSE-2.0 |
|||
* |
|||
* Unless required by applicable law or agreed to in writing, software |
|||
* distributed under the License is distributed on an "AS IS" BASIS, |
|||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
* See the License for the specific language governing permissions and |
|||
* limitations under the License. |
|||
*/ |
|||
|
|||
|
|||
#pragma once |
|||
|
|||
#include <limits> |
|||
#include <cstdint> |
|||
#include <algorithm> |
|||
|
|||
#include <math_functions.h> |
|||
|
|||
#include <cusp/detail/device/spmv/csr_vector.h> |
|||
|
|||
#include "storm-cudaplugin-config.h" |
|||
|
|||
namespace cusp |
|||
{ |
|||
namespace detail |
|||
{ |
|||
namespace device |
|||
{ |
|||
|
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// CSR SpMV kernels based on a vector model (one warp per row) |
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// |
|||
// spmv_csr_vector_device |
|||
// Each row of the CSR matrix is assigned to a warp. The warp computes |
|||
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
|||
// the x vector, in parallel. This division of work implies that |
|||
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
|||
// manner (but generally not aligned). On GT200 these accesses are |
|||
// coalesced, unlike kernels based on the one-row-per-thread division of |
|||
// work. Since an entire 32-thread warp is assigned to each row, many |
|||
// threads will remain idle when their row contains a small number |
|||
// of elements. This code relies on implicit synchronization among |
|||
// threads in a warp. |
|||
// |
|||
// spmv_csr_vector_tex_device |
|||
// Same as spmv_csr_vector_tex_device, except that the texture cache is |
|||
// used for accessing the x vector. |
|||
// |
|||
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
|||
|
|||
|
|||
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
|||
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
|||
__global__ void |
|||
storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y) |
|||
{ |
|||
__shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
|||
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local sum |
|||
float sum = 0; |
|||
|
|||
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
|||
#else |
|||
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
|||
#endif |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
|||
#else |
|||
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
|||
#endif |
|||
} |
|||
} else { |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
|||
#else |
|||
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
|||
#endif |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = sum; |
|||
|
|||
// reduce local sums to row sum |
|||
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
|||
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
|||
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
|||
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
|||
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
y[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |

template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW, 1)
__global__ void
storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer)
{
    __shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2];  // padded to avoid reduction conditionals
    __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2];

    const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW;

    const uint_fast64_t thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;  // global thread index
    const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1);           // thread index within the row
    const uint_fast64_t vector_id   = thread_id / THREADS_PER_ROW;                   // global row index
    const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW;                 // row index within the block
    const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x;                    // total number of active rows

    for (uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
    {
        // use two threads to fetch Ap[row] and Ap[row + 1];
        // this is considerably faster than the straightforward version
        if (thread_lane < 2)
            ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane];

        const uint_fast64_t row_start = ptrs[vector_lane][0];  // same as: row_start = Ap[row];
        const uint_fast64_t row_end   = ptrs[vector_lane][1];  // same as: row_end   = Ap[row + 1];

        // initialize local min/max
        float localMinMaxElement = minMaxInitializer;

        if (THREADS_PER_ROW == 32 && row_end - row_start > 32)
        {
            // ensure aligned memory access to Aj and Ax
            uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane;

            // accumulate local min/max
            if (jj >= row_start && jj < row_end) {
                if (Minimize) {
                    localMinMaxElement = min(localMinMaxElement, y[jj]);
                } else {
                    localMinMaxElement = max(localMinMaxElement, y[jj]);
                }
            }

            // accumulate local min/max
            for (jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) {
                if (Minimize) {
                    localMinMaxElement = min(localMinMaxElement, y[jj]);
                } else {
                    localMinMaxElement = max(localMinMaxElement, y[jj]);
                }
            }
        }
        else
        {
            // accumulate local min/max
            for (uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) {
                if (Minimize) {
                    localMinMaxElement = min(localMinMaxElement, y[jj]);
                } else {
                    localMinMaxElement = max(localMinMaxElement, y[jj]);
                }
            }
        }

        // store local min/max in shared memory
        sdata[threadIdx.x] = localMinMaxElement;

        // reduce local min/max to row min/max
        if (Minimize) {
            if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]);
            if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  8]);
            if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  4]);
            if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  2]);
            if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  1]);
        } else {
            if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]);
            if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x +  8]);
            if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x +  4]);
            if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x +  2]);
            if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x +  1]);
        }

        // first thread writes the result
        if (thread_lane == 0)
            x[row] = sdata[threadIdx.x];
    }
}
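
// Note (editorial addition): like the original CUSP code, the reduction above is
// warp-synchronous -- THREADS_PER_ROW never exceeds the warp size and sdata is
// volatile, so no explicit synchronization is issued. A more defensive variant
// for GPUs with independent thread scheduling (CUDA 9+) could, as a sketch,
// insert __syncwarp() between the reduction steps:
//
//   if (THREADS_PER_ROW > 16) { sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); __syncwarp(); }
//   // ... and likewise for the remaining offsets.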

template <bool Minimize, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y)
{
    float __minMaxInitializer = -std::numeric_limits<float>::max();
    if (Minimize) {
        __minMaxInitializer = std::numeric_limits<float>::max();
    }
    const float minMaxInitializer = __minMaxInitializer;

    const size_t THREADS_PER_BLOCK = 128;
    const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;

    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0);
    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));

    storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
        (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer);
}

template <bool Minimize>
void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y)
{
    const uint_fast64_t rows_per_group = num_entries / num_rows;

    if (rows_per_group <=  2) { __storm_cuda_opt_vector_reduce_float<Minimize,  2>(num_rows, nondeterministicChoiceIndices, x, y); return; }
    if (rows_per_group <=  4) { __storm_cuda_opt_vector_reduce_float<Minimize,  4>(num_rows, nondeterministicChoiceIndices, x, y); return; }
    if (rows_per_group <=  8) { __storm_cuda_opt_vector_reduce_float<Minimize,  8>(num_rows, nondeterministicChoiceIndices, x, y); return; }
    if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float<Minimize, 16>(num_rows, nondeterministicChoiceIndices, x, y); return; }

    __storm_cuda_opt_vector_reduce_float<Minimize, 32>(num_rows, nondeterministicChoiceIndices, x, y);
}
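
// Illustrative usage (editorial sketch, not part of the original header): a
// single value-iteration step typically performs an SpMV into a temporary
// result vector and then reduces each row group back into x. Assuming device
// buffers d_rowIndices, d_columnIndicesAndValues, d_choiceIndices, d_x and
// d_multiplyResult have already been allocated and filled by the caller
// (all names here are hypothetical), this could look like:
//
//   // y = A * x, using the packed column-index/value layout
//   storm_cuda_opt_spmv_csr_vector_float(numMatrixRows, numEntries,
//       d_rowIndices, d_columnIndicesAndValues, d_x, d_multiplyResult);
//   // x[group] = min over the group's rows of y (Minimize = true)
//   storm_cuda_opt_vector_reduce_float<true>(numRowGroups, numMatrixRows,
//       d_choiceIndices, d_x, d_multiplyResult);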

template <bool UseCache, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
{
    const size_t THREADS_PER_BLOCK = 128;
    const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;

    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));

    if (UseCache)
        bind_x(x);

    storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
        (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);

    if (UseCache)
        unbind_x(x);
}

void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
{
    const uint_fast64_t nnz_per_row = num_entries / num_rows;

    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector_float<false,  2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector_float<false,  4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector_float<false,  8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<false, 16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }

    __storm_cuda_opt_spmv_csr_vector_float<false, 32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}

void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
{
    const uint_fast64_t nnz_per_row = num_entries / num_rows;

    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector_float<true,  2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector_float<true,  4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector_float<true,  8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<true, 16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }

    __storm_cuda_opt_spmv_csr_vector_float<true, 32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}

// NON-OPT

template <bool UseCache, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
{
    const size_t THREADS_PER_BLOCK = 128;
    const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;

    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));

    if (UseCache)
        bind_x(x);

    spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
        (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);

    if (UseCache)
        unbind_x(x);
}

void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
{
    const uint_fast64_t nnz_per_row = num_entries / num_rows;

    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector_float<false,  2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector_float<false,  4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector_float<false,  8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<false, 16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }

    __storm_cuda_spmv_csr_vector_float<false, 32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}

void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
{
    const uint_fast64_t nnz_per_row = num_entries / num_rows;

    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector_float<true,  2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector_float<true,  4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector_float<true,  8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<true, 16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }

    __storm_cuda_spmv_csr_vector_float<true, 32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}
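
// Usage note (editorial sketch): the non-opt wrappers above take the standard
// CSR triple (row offsets, column indices, values) in separate arrays and
// forward them to the stock CUSP spmv_csr_vector_kernel, e.g.
//
//   storm_cuda_spmv_csr_vector_float(numRows, numEntries, d_rowOffsets,
//       d_columnIndices, d_values, d_x, d_y);
//
// whereas the "opt" wrappers expect the packed index/value layout described
// after the optimized kernel further up in this file. All argument names here
// are hypothetical placeholders.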

} // end namespace device
} // end namespace detail
} // end namespace cusp