All tests for CUDA based TopologicalValueIterationMdpPrctlModelChecker passing on Windows.
Former-commit-id: 68cafa6f84
tempestpy_adaptions
David_Korzeniewski
10 years ago
27 changed files with 2509 additions and 43 deletions
  103  CMakeLists.txt
   64  cuda/CMakeAlignmentCheck.cpp
   31  cuda/CMakeFloatAlignmentCheck.cpp
    6  cuda/kernels/allCudaKernels.h
    0  cuda/kernels/bandWidth.cu
    0  cuda/kernels/bandWidth.h
  286  cuda/kernels/basicAdd.cu
    9  cuda/kernels/basicAdd.h
  879  cuda/kernels/basicValueIteration.cu
  119  cuda/kernels/basicValueIteration.h
   19  cuda/kernels/cudaForStorm.h
   49  cuda/kernels/cuspExtension.h
  361  cuda/kernels/cuspExtensionDouble.h
  375  cuda/kernels/cuspExtensionFloat.h
   39  cuda/kernels/kernelSwitchTest.cu
    1  cuda/kernels/kernelSwitchTest.h
   33  cuda/kernels/utility.cu
   12  cuda/kernels/utility.h
   28  cuda/kernels/version.cu
   16  cuda/kernels/version.h
   21  cuda/storm-cudaplugin-config.h.in
    2  src/solver/NativeNondeterministicLinearEquationSolver.cpp
   24  src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
   38  src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
    9  src/storage/SparseMatrix.cpp
   12  src/storage/SparseMatrix.h
   16  test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
cuda/CMakeAlignmentCheck.cpp
@@ -0,0 +1,64 @@
/*
 * This is a component of StoRM's CUDA plugin. It checks whether the type alignment matches the assumptions made while optimizing the code.
 */
#include <cstdint>
#include <utility>
#include <vector>

#define CONTAINER_SIZE 100ul

template <typename IndexType, typename ValueType>
int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
    std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
    for (size_t i = 0; i < containerSize; ++i) {
        myVector->push_back(std::make_pair(firstValue, secondValue));
    }
    size_t myVectorSize = myVector->size();
    IndexType* firstStart = &(myVector->at(0).first);
    IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
    ValueType* secondStart = &(myVector->at(0).second);
    ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
    size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
    size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
    size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
    size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);

    delete myVector;
    myVector = nullptr;

    if (myVectorSize != containerSize) {
        return -2;
    }

    // Check for alignment:
    // The requirement is that the pairs are laid out as: first, second, first, second, first, second, ...
    if (sizeof(IndexType) != sizeof(ValueType)) {
        return -3;
    }
    if (startOffset != sizeof(IndexType)) {
        return -4;
    }
    if (endOffset != sizeof(IndexType)) {
        return -5;
    }
    if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
        return -6;
    }
    if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
        return -7;
    }

    return 0;
}

int main(int argc, char* argv[]) {
    int result = 0;

    result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
    if (result != 0) {
        return result;
    }

    return 0;
}
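Why this layout matters (illustrative sketch, not part of this changeset): the solver later hands std::vector<std::pair<IndexType, ValueType>>::data() directly to cudaMemcpy, so the pairs must be packed as interleaved first/second words. A minimal host-only sketch of that assumption, assuming uint_fast64_t is 8 bytes wide as on the usual 64-bit targets:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        // The compile fails here if the pair is not packed exactly as "first, second".
        static_assert(sizeof(std::pair<uint_fast64_t, double>) == 2 * sizeof(uint_fast64_t),
                      "pair must be packed as first, second with no padding");

        std::vector<std::pair<uint_fast64_t, double>> entries = {{7, 0.25}, {11, 0.75}};

        // Reinterpret the contiguous pair storage as raw interleaved words: even words are
        // the column indices, odd words hold the bit patterns of the double values.
        uint_fast64_t rawIndex;
        double rawValue;
        auto const* bytes = reinterpret_cast<char const*>(entries.data());
        std::memcpy(&rawIndex, bytes + sizeof(std::pair<uint_fast64_t, double>), sizeof(rawIndex));
        std::memcpy(&rawValue, bytes + sizeof(std::pair<uint_fast64_t, double>) + sizeof(uint_fast64_t), sizeof(rawValue));

        std::cout << "second entry: index " << rawIndex << ", value " << rawValue << std::endl;
        return 0;
    }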
cuda/CMakeFloatAlignmentCheck.cpp
@@ -0,0 +1,31 @@
/*
 * This is a component of StoRM's CUDA plugin. It checks whether a pair of uint_fast64_t and float gets auto-aligned to match 64-bit boundaries.
 */
#include <cstdint>
#include <utility>
#include <vector>

#define CONTAINER_SIZE 100ul

int main(int argc, char* argv[]) {
    int result = 0;

    std::vector<std::pair<uint_fast64_t, float>> myVector;
    for (size_t i = 0; i < CONTAINER_SIZE; ++i) {
        myVector.push_back(std::make_pair(i, 42.12345f * i));
    }

    char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first));
    char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first));
    ptrdiff_t uintDiff = secondUintPointer - firstUintPointer;

    if (uintDiff == (2 * sizeof(uint_fast64_t))) {
        result = 2;
    } else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) {
        result = 3;
    } else {
        result = -5;
    }

    return result;
}
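For comparison (illustrative sketch, not part of this changeset), the same padded-versus-packed distinction can be read off sizeof directly; exit code 2 above corresponds to a pair padded up to two 64-bit words, exit code 3 to a packed 12-byte pair:

    #include <cstdint>
    #include <iostream>
    #include <utility>

    int main() {
        // Stride between consecutive pairs in a vector equals sizeof(pair).
        constexpr std::size_t pairSize = sizeof(std::pair<uint_fast64_t, float>);
        if (pairSize == 2 * sizeof(uint_fast64_t)) {
            std::cout << "pair is padded to 64-bit boundaries (check above would return 2)" << std::endl;
        } else if (pairSize == sizeof(uint_fast64_t) + sizeof(float)) {
            std::cout << "pair is packed (check above would return 3)" << std::endl;
        } else {
            std::cout << "unexpected layout, pair size is " << pairSize << std::endl;
        }
        return 0;
    }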
cuda/kernels/allCudaKernels.h
@@ -0,0 +1,6 @@
#include "utility.h"
#include "bandWidth.h"
#include "basicAdd.h"
#include "kernelSwitchTest.h"
#include "basicValueIteration.h"
#include "version.h"
cuda/kernels/basicAdd.cu
@@ -0,0 +1,286 @@
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>

#include <chrono>
#include <iostream>

__global__ void cuda_kernel_basicAdd(int a, int b, int *c) {
    *c = a + b;
}

__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
    // Fused Multiply Add:
    // A * B + C => D

    /*
     * The variable i indexes into the arrays. Since every thread executes this kernel,
     * i has to be different for each thread; otherwise different threads would write to
     * the same array index. blockDim.x is the number of threads in the x-dimension of the
     * block, blockIdx.x is the x-coordinate of the current block, and threadIdx.x is the
     * x-coordinate of the thread that is currently executing this function.
     */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N) {
        D[i] = A[i] * B[i] + C[i];
    }
}

__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) {
    // Fused Multiply Add:
    // A * B + C => D

    // Layout:
    // A B C D A B C D A B C D

    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if ((i*M) < N) {
        for (int j = i*M; j < i*M + M; ++j) {
            A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2];
        }
    }
}

extern "C" int cuda_basicAdd(int a, int b) {
    int c = 0;
    int *dev_c;
    cudaMalloc((void**)&dev_c, sizeof(int));
    cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c);
    cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    //printf("%d + %d + 42 is %d\n", a, b, c);
    cudaFree(dev_c);
    return c;
}

void cpp_cuda_bandwidthTest(int entryCount, int N) {
    // Size of the arrays in bytes
    size_t arraySize = entryCount * sizeof(int);

    int* deviceIntArray;
    int* hostIntArray = new int[arraySize];

    // Allocate space on the device
    auto start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
            std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl;
            delete[] hostIntArray;
            return;
        }
        // Free memory on device
        if (cudaFree(deviceIntArray) != cudaSuccess) {
            std::cout << "Error in cudaFree!" << std::endl;
            delete[] hostIntArray;
            return;
        }
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second allocation speed." << std::endl;

    if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
        std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl;
        delete[] hostIntArray;
        return;
    }

    // Prepare data
    for (int i = 0; i < N; ++i) {
        hostIntArray[i] = i * 333 + 123;
    }

    // Copy data TO device
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl;
            // Free memory on device
            if (cudaFree(deviceIntArray) != cudaSuccess) {
                std::cout << "Error in cudaFree!" << std::endl;
            }
            delete[] hostIntArray;
            return;
        }
    }
    end_time = std::chrono::high_resolution_clock::now();
    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl;

    // Copy data FROM device
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl;
            // Free memory on device
            if (cudaFree(deviceIntArray) != cudaSuccess) {
                std::cout << "Error in cudaFree!" << std::endl;
            }
            delete[] hostIntArray;
            return;
        }
    }
    end_time = std::chrono::high_resolution_clock::now();
    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl;

    // Free memory on device
    if (cudaFree(deviceIntArray) != cudaSuccess) {
        std::cout << "Error in cudaFree!" << std::endl;
    }
    delete[] hostIntArray;
}

extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
    // Size of the arrays in bytes
    size_t arraySize = N * sizeof(int);

    int* deviceIntArrayA;
    int* deviceIntArrayB;
    int* deviceIntArrayC;
    int* deviceIntArrayD;

    // Allocate space on the device
    if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc1!\n");
        return;
    }
    if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc2!\n");
        cudaFree(deviceIntArrayA);
        return;
    }
    if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc3!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        return;
    }
    if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc4!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        return;
    }

    // Copy data TO device
    if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }
    if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }
    if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }

    // Set the number of threads per block
    int threadsPerBlock = 512;
    // Use as many blocks as are needed to process all elements of the vectors
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Run kernel
    cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N);

    // Copy data FROM device
    if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }

    // Free memory on device
    cudaFree(deviceIntArrayA);
    cudaFree(deviceIntArrayB);
    cudaFree(deviceIntArrayC);
    cudaFree(deviceIntArrayD);
}

extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) {
    // Size of the array in bytes
    size_t arraySize = N * sizeof(int) * 4;

    int* deviceIntArrayA;

    // Allocate space on the device
    if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc1!\n");
        return;
    }

#define ONFAILFREE0() do { } while(0)
#define ONFAILFREE1(a) do { cudaFree(a); } while(0)
#define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0)
#define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0)
#define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0)
#define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { cudaError_t retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in cuda" #func__ "!\n"); return; } } while(0)

    // Copy data TO device

    CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice);

    /*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        return;
    }*/

    // Set the number of threads per block
    int threadsPerBlock = 512;
    // Use as many blocks as are needed to process all elements of the vectors
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Run kernel
    cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M);

    // Copy data FROM device
    if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        return;
    }

    // Free memory on device
    if (cudaFree(deviceIntArrayA) != cudaSuccess) {
        printf("Error in cudaFree!\n");
        return;
    }
}

extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
    for (int i = 0; i < N; ++i) {
        D[i] = A[i] * B[i] + C[i];
    }
}

extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) {
    for (int i = 0; i < N; i += 4) {
        A[i+3] = A[i] * A[i+1] + A[i+2];
    }
}
cuda/kernels/basicAdd.h
@@ -0,0 +1,9 @@
extern "C" int cuda_basicAdd(int a, int b);

extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M);
extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N);

extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N);
extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N);

void cpp_cuda_bandwidthTest(int entryCount, int N);
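A possible host-side driver (illustrative sketch, not part of this changeset) that exercises these entry points and checks the GPU FMA against the CPU helper:

    #include <cstdio>
    #include <vector>

    #include "basicAdd.h"

    int main() {
        int const n = 1024;
        std::vector<int> a(n), b(n), c(n), gpuResult(n), cpuResult(n);
        for (int i = 0; i < n; ++i) {
            a[i] = i;
            b[i] = 2 * i;
            c[i] = 3;
        }

        cuda_arrayFma(a.data(), b.data(), c.data(), gpuResult.data(), n);        // device FMA
        cuda_arrayFmaHelper(a.data(), b.data(), c.data(), cpuResult.data(), n);  // host reference

        int mismatches = 0;
        for (int i = 0; i < n; ++i) {
            if (gpuResult[i] != cpuResult[i]) {
                ++mismatches;
            }
        }
        std::printf("basicAdd(3, 4) = %d, FMA mismatches: %d\n", cuda_basicAdd(3, 4), mismatches);
        return mismatches == 0 ? 0 : 1;
    }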
cuda/kernels/basicValueIteration.cu
@@ -0,0 +1,879 @@
#include "basicValueIteration.h"
#define CUSP_USE_TEXTURE_MEMORY

#include <iostream>
#include <chrono>

#include <cuda_runtime.h>
#include "cusparse_v2.h"

#include "utility.h"

#include "cuspExtension.h"

#include <thrust/transform.h>
#include <thrust/device_ptr.h>
#include <thrust/functional.h>

#include "storm-cudaplugin-config.h"

#ifdef DEBUG
#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false)
#else
#define CUDA_CHECK_ALL_ERRORS() do {} while (false)
#endif

template<typename T, bool Relative>
struct equalModuloPrecision : public thrust::binary_function<T,T,T>
{
    __host__ __device__ T operator()(const T &x, const T &y) const
    {
        if (Relative) {
            if (y == 0) {
                return ((x >= 0) ? (x) : (-x));
            }
            const T result = (x - y) / y;
            return ((result >= 0) ? (result) : (-result));
        } else {
            const T result = (x - y);
            return ((result >= 0) ? (result) : (-result));
        }
    }
};

template<typename IndexType, typename ValueType>
void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) {
    indexVector.reserve(inputVector.size());
    valueVector.reserve(inputVector.size());
    for (size_t i = 0; i < inputVector.size(); ++i) {
        indexVector.push_back(inputVector.at(i).first);
        valueVector.push_back(inputVector.at(i).second);
    }
}

// TEMPLATE VERSION
template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) {
    //std::vector<IndexType> matrixColumnIndices;
    //std::vector<ValueType> matrixValues;
    //exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
    bool errorOccured = false;

    IndexType* device_matrixRowIndices = nullptr;
    ValueType* device_matrixColIndicesAndValues = nullptr;
    ValueType* device_x = nullptr;
    ValueType* device_xSwap = nullptr;
    ValueType* device_b = nullptr;
    ValueType* device_multiplyResult = nullptr;
    IndexType* device_nondeterministicChoiceIndices = nullptr;

#ifdef DEBUG
    std::cout.sync_with_stdio(true);
    std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
    std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << " Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
    size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
    std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
#endif

    const IndexType matrixRowCount = matrixRowIndices.size() - 1;
    const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1;
    const IndexType matrixNnzCount = columnIndicesAndValues.size();

    cudaError_t cudaMallocResult;

    bool converged = false;
    iterationCount = 0;

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true
#else
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false
#endif
    if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
        CUDA_CHECK_ALL_ERRORS();
        cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
        if (cudaMallocResult != cudaSuccess) {
            std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
            errorOccured = true;
            goto cleanup;
        }
    } else {
        CUDA_CHECK_ALL_ERRORS();
        cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
        if (cudaMallocResult != cudaSuccess) {
            std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
            errorOccured = true;
            goto cleanup;
        }
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_xSwap), sizeof(ValueType) * matrixColCount);
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * matrixRowCount);
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1));
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

#ifdef DEBUG
    std::cout << "(DLL) Finished allocating memory." << std::endl;
#endif

    // Memory allocated, copy data to device
    cudaError_t cudaCopyResult;

    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    // Copy all data as floats are expanded to 64bits :/
    if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
        CUDA_CHECK_ALL_ERRORS();
        cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
        if (cudaCopyResult != cudaSuccess) {
            std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
            errorOccured = true;
            goto cleanup;
        }
    } else {
        CUDA_CHECK_ALL_ERRORS();
        cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
        if (cudaCopyResult != cudaSuccess) {
            std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
            errorOccured = true;
            goto cleanup;
        }
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    // Preset the xSwap to zeros...
    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    // Preset the multiplyResult to zeros...
    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not copy data for Nondeterministic Choice Indices, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

#ifdef DEBUG
    std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
#endif

    // Data is on device, start the kernel
    while (!converged && iterationCount < maxIterationCount) { // Kept in a nested scope so that the goto to cleanup does not jump across these initializations
        cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
        CUDA_CHECK_ALL_ERRORS();

        thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
        thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult);

        // Transform: Add multiplyResult + b in place to multiplyResult
        thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>());
        CUDA_CHECK_ALL_ERRORS();

        // Reduce: Reduce multiplyResult to a new x vector
        cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
        CUDA_CHECK_ALL_ERRORS();

        // Check for convergence
        // Transform: x = abs(x - xSwap) / xSwap
        thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
        thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount);
        thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap);
        thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
        CUDA_CHECK_ALL_ERRORS();

        // Reduce: get the maximum over x and check for res < precision
        ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
        CUDA_CHECK_ALL_ERRORS();
        converged = (maxX < precision);
        ++iterationCount;

        // Swap pointers, device_x always contains the most current result
        std::swap(device_x, device_xSwap);
    }

    if (!converged && (iterationCount == maxIterationCount)) {
        iterationCount = 0;
        errorOccured = true;
    }

#ifdef DEBUG
    std::cout << "(DLL) Finished kernel execution." << std::endl;
    std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
#endif

    // Get x back from the device
    cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

#ifdef DEBUG
    std::cout << "(DLL) Finished copying result data." << std::endl;
#endif

    // All code related to freeing memory and clearing up the device
cleanup:
    if (device_matrixRowIndices != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_matrixRowIndices = nullptr;
    }
    if (device_matrixColIndicesAndValues != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_matrixColIndicesAndValues = nullptr;
    }
    if (device_x != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_x);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_x = nullptr;
    }
    if (device_xSwap != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_xSwap);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_xSwap = nullptr;
    }
    if (device_b != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_b);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_b = nullptr;
    }
    if (device_multiplyResult != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_multiplyResult);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_multiplyResult = nullptr;
    }
    if (device_nondeterministicChoiceIndices != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_nondeterministicChoiceIndices = nullptr;
    }
#ifdef DEBUG
    std::cout << "(DLL) Finished cleanup." << std::endl;
#endif

    return !errorOccured;
}
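For reference (illustrative sketch, not part of this changeset), one sweep of the device loop above corresponds to the following host-side computation; the CSR layout and the row grouping via nondeterministicChoiceIndices match the function's parameters, while all other names are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // One host-side value-iteration sweep mirroring the device loop: SpMV, add b,
    // reduce each row group to its min/max, then measure the (relative) change.
    bool cpuValueIterationStep(bool minimize, bool relative, double precision,
                               std::vector<uint_fast64_t> const& matrixRowIndices,
                               std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues,
                               std::vector<double>& x,
                               std::vector<double> const& b,
                               std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
        size_t const rowCount = matrixRowIndices.size() - 1;
        size_t const groupCount = nondeterministicChoiceIndices.size() - 1;

        // SpMV over the CSR structure, then add b (the thrust::transform step above).
        std::vector<double> multiplyResult(rowCount, 0.0);
        for (size_t row = 0; row < rowCount; ++row) {
            for (uint_fast64_t k = matrixRowIndices[row]; k < matrixRowIndices[row + 1]; ++k) {
                multiplyResult[row] += columnIndicesAndValues[k].second * x[columnIndicesAndValues[k].first];
            }
            multiplyResult[row] += b[row];
        }

        // Reduce every nondeterministic row group to a single value (min or max).
        std::vector<double> newX(groupCount);
        for (size_t group = 0; group < groupCount; ++group) {
            double value = multiplyResult[nondeterministicChoiceIndices[group]];
            for (uint_fast64_t row = nondeterministicChoiceIndices[group] + 1; row < nondeterministicChoiceIndices[group + 1]; ++row) {
                value = minimize ? std::min(value, multiplyResult[row]) : std::max(value, multiplyResult[row]);
            }
            newX[group] = value;
        }

        // Convergence test matching equalModuloPrecision: largest (relative) difference < precision.
        double maxDiff = 0.0;
        for (size_t group = 0; group < groupCount; ++group) {
            double diff = std::abs(x[group] - newX[group]);
            if (relative && newX[group] != 0.0) {
                diff /= std::abs(newX[group]);
            }
            maxDiff = std::max(maxDiff, diff);
        }
        x = newX;
        return maxDiff < precision;
    }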
||||
|
|
||||
|
template <typename IndexType, typename ValueType> |
||||
|
void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) { |
||||
|
IndexType* device_matrixRowIndices = nullptr; |
||||
|
ValueType* device_matrixColIndicesAndValues = nullptr; |
||||
|
ValueType* device_x = nullptr; |
||||
|
ValueType* device_multiplyResult = nullptr; |
||||
|
|
||||
|
#ifdef DEBUG |
||||
|
std::cout.sync_with_stdio(true); |
||||
|
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl; |
||||
|
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl; |
||||
|
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size(); |
||||
|
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; |
||||
|
#endif |
||||
|
|
||||
|
const IndexType matrixRowCount = matrixRowIndices.size() - 1; |
||||
|
const IndexType matrixNnzCount = columnIndicesAndValues.size(); |
||||
|
|
||||
|
cudaError_t cudaMallocResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1)); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
#else |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
#endif |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
#ifdef DEBUG |
||||
|
std::cout << "(DLL) Finished allocating memory." << std::endl; |
||||
|
#endif |
||||
|
|
||||
|
// Memory allocated, copy data to device |
||||
|
cudaError_t cudaCopyResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
#else |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
#endif |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// Preset the multiplyResult to zeros... |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
#ifdef DEBUG |
||||
|
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl; |
||||
|
#endif |
||||
|
|
||||
|
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
|
||||
|
#ifdef DEBUG |
||||
|
std::cout << "(DLL) Finished kernel execution." << std::endl; |
||||
|
#endif |
||||
|
|
||||
|
// Get result back from the device |
||||
|
cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
#ifdef DEBUG |
||||
|
std::cout << "(DLL) Finished copying result data." << std::endl; |
||||
|
#endif |
||||
|
|
||||
|
// All code related to freeing memory and clearing up the device |
||||
|
cleanup: |
||||
|
if (device_matrixRowIndices != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_matrixRowIndices = nullptr; |
||||
|
} |
||||
|
if (device_matrixColIndicesAndValues != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_matrixColIndicesAndValues = nullptr; |
||||
|
} |
||||
|
if (device_x != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_x); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_x = nullptr; |
||||
|
} |
||||
|
if (device_multiplyResult != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_multiplyResult); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_multiplyResult = nullptr; |
||||
|
} |
||||
|
#ifdef DEBUG |
||||
|
std::cout << "(DLL) Finished cleanup." << std::endl; |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
template <typename ValueType> |
||||
|
void basicValueIteration_addVectorsInplace(std::vector<ValueType>& a, std::vector<ValueType> const& b) { |
||||
|
ValueType* device_a = nullptr; |
||||
|
ValueType* device_b = nullptr; |
||||
|
|
||||
|
const size_t vectorSize = std::max(a.size(), b.size()); |
||||
|
|
||||
|
cudaError_t cudaMallocResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_a), sizeof(ValueType) * vectorSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * vectorSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// Memory allocated, copy data to device |
||||
|
cudaError_t cudaCopyResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
do { |
||||
|
// Transform: Add multiplyResult + b inplace to multiplyResult |
||||
|
thrust::device_ptr<ValueType> devicePtrThrust_a(device_a); |
||||
|
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b); |
||||
|
thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus<ValueType>()); |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
} while (false); |
||||
|
|
||||
|
// Get result back from the device |
||||
|
cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// All code related to freeing memory and clearing up the device |
||||
|
cleanup: |
||||
|
if (device_a != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_a); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_a = nullptr; |
||||
|
} |
||||
|
if (device_b != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_b); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_b = nullptr; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
template <typename IndexType, typename ValueType, bool Minimize> |
||||
|
void basicValueIteration_reduceGroupedVector(std::vector<ValueType> const& groupedVector, std::vector<IndexType> const& grouping, std::vector<ValueType>& targetVector) { |
||||
|
ValueType* device_groupedVector = nullptr; |
||||
|
IndexType* device_grouping = nullptr; |
||||
|
ValueType* device_target = nullptr; |
||||
|
|
||||
|
const size_t groupedSize = groupedVector.size(); |
||||
|
const size_t groupingSize = grouping.size(); |
||||
|
const size_t targetSize = targetVector.size(); |
||||
|
|
||||
|
cudaError_t cudaMallocResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_groupedVector), sizeof(ValueType) * groupedSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_grouping), sizeof(IndexType) * groupingSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_target), sizeof(ValueType) * targetSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// Memory allocated, copy data to device |
||||
|
cudaError_t cudaCopyResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
do { |
||||
|
// Reduce: Reduce multiplyResult to a new x vector |
||||
|
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
} while (false); |
||||
|
|
||||
|
// Get result back from the device |
||||
|
cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// All code related to freeing memory and clearing up the device |
||||
|
cleanup: |
||||
|
if (device_groupedVector != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_groupedVector); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_groupedVector = nullptr; |
||||
|
} |
||||
|
if (device_grouping != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_grouping); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_grouping = nullptr; |
||||
|
} |
||||
|
if (device_target != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_target); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_target = nullptr; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
template <typename ValueType, bool Relative> |
||||
|
void basicValueIteration_equalModuloPrecision(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType& maxElement) { |
||||
|
ValueType* device_x = nullptr; |
||||
|
ValueType* device_y = nullptr; |
||||
|
|
||||
|
const size_t vectorSize = x.size(); |
||||
|
|
||||
|
cudaError_t cudaMallocResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * vectorSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_y), sizeof(ValueType) * vectorSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// Memory allocated, copy data to device |
||||
|
cudaError_t cudaCopyResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
do { |
||||
|
// Transform: x = abs(x - xSwap)/ xSwap |
||||
|
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x); |
||||
|
thrust::device_ptr<ValueType> devicePtrThrust_y(device_y); |
||||
|
thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>()); |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
|
||||
|
// Reduce: get Max over x and check for res < Precision |
||||
|
maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>()); |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
} while (false); |
||||
|
|
||||
|
// All code related to freeing memory and clearing up the device |
||||
|
cleanup: |
||||
|
if (device_x != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_x); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_x = nullptr; |
||||
|
} |
||||
|
if (device_y != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_y); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_y = nullptr; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/* |
||||
|
* Declare and implement all exported functions for these Kernels here |
||||
|
* |
||||
|
*/ |
||||
|
|
||||
|
void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) {
	basicValueIteration_spmv<uint_fast64_t, double>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
}

void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) {
	basicValueIteration_addVectorsInplace<double>(a, b);
}

void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
	basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(groupedVector, grouping, targetVector);
}

void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
	basicValueIteration_reduceGroupedVector<uint_fast64_t, double, false>(groupedVector, grouping, targetVector);
}

void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
	basicValueIteration_equalModuloPrecision<double, true>(x, y, maxElement);
}

void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
	basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement);
}

// Float
void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b) {
	basicValueIteration_spmv<uint_fast64_t, float>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
}

void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b) {
	basicValueIteration_addVectorsInplace<float>(a, b);
}

void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
	basicValueIteration_reduceGroupedVector<uint_fast64_t, float, true>(groupedVector, grouping, targetVector);
}

void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
	basicValueIteration_reduceGroupedVector<uint_fast64_t, float, false>(groupedVector, grouping, targetVector);
}

void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
	basicValueIteration_equalModuloPrecision<float, true>(x, y, maxElement);
}

void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
	basicValueIteration_equalModuloPrecision<float, false>(x, y, maxElement);
}

bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
	if (relativePrecisionCheck) {
		return basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	} else {
		return basicValueIteration_mvReduce<true, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	}
}

bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
	if (relativePrecisionCheck) {
		return basicValueIteration_mvReduce<false, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	} else {
		return basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	}
}

bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
	if (relativePrecisionCheck) {
		return basicValueIteration_mvReduce<true, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	} else {
		return basicValueIteration_mvReduce<true, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	}
}

bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
	if (relativePrecisionCheck) {
		return basicValueIteration_mvReduce<false, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	} else {
		return basicValueIteration_mvReduce<false, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	}
}

size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
	size_t const valueTypeSize = sizeof(double);
	size_t const indexTypeSize = sizeof(uint_fast64_t);

	/*
	IndexType* device_matrixRowIndices = nullptr;
	IndexType* device_matrixColIndices = nullptr;
	ValueType* device_matrixValues = nullptr;
	ValueType* device_x = nullptr;
	ValueType* device_xSwap = nullptr;
	ValueType* device_b = nullptr;
	ValueType* device_multiplyResult = nullptr;
	IndexType* device_nondeterministicChoiceIndices = nullptr;
	*/

	// Row Indices, Column Indices, Values, Choice Indices
	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
	// Vectors x, xSwap, b, multiplyResult
	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);

	return (matrixDataSize + vectorSizes);
}

size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
	size_t const valueTypeSize = sizeof(float);
	size_t const indexTypeSize = sizeof(uint_fast64_t);

	/*
	IndexType* device_matrixRowIndices = nullptr;
	IndexType* device_matrixColIndices = nullptr;
	ValueType* device_matrixValues = nullptr;
	ValueType* device_x = nullptr;
	ValueType* device_xSwap = nullptr;
	ValueType* device_b = nullptr;
	ValueType* device_multiplyResult = nullptr;
	IndexType* device_nondeterministicChoiceIndices = nullptr;
	*/

	// Row Indices, Column Indices, Values, Choice Indices
	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
	// Vectors x, xSwap, b, multiplyResult
	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);

	return (matrixDataSize + vectorSizes);
}
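The two *_calculateMemorySize helpers let a caller decide up front whether a model fits on the device. As a worked example with double values, rowCount = 4, rowGroupCount = 2 and nnzCount = 8 give a matrix footprint of (5 + 8 + 3) * 8 + 8 * 8 = 192 bytes and a vector footprint of (2 + 2 + 4 + 4) * 8 = 96 bytes, 288 bytes in total. A hypothetical caller (helper name invented; cudaMemGetInfo is the standard CUDA runtime query) could compare that against the free device memory:

#include <cuda_runtime.h>
#include <cstddef>
#include <iostream>

// Sketch: decide whether the CUDA-based value iteration can be attempted for a model.
bool fitsOnDevice(size_t rowCount, size_t rowGroupCount, size_t nnzCount) {
    size_t const required = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(rowCount, rowGroupCount, nnzCount);
    size_t freeBytes = 0;
    size_t totalBytes = 0;
    if (cudaMemGetInfo(&freeBytes, &totalBytes) != cudaSuccess) {
        return false; // no usable CUDA device
    }
    std::cout << "CUDA plugin needs " << required << " of " << freeBytes << " free bytes." << std::endl;
    return required <= freeBytes;
}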
@ -0,0 +1,119 @@ |
|||||
|
#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
||||
|
#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
||||
|
|
||||
|
#include <cstdint> |
||||
|
#include <vector> |
||||
|
#include <utility> |
||||
|
|
||||
|
// Library exports |
||||
|
#include "cudaForStorm.h" |
||||
|
|
||||
|
/* Helper declaration to cope with new internal format */ |
||||
|
#ifndef STORM_STORAGE_SPARSEMATRIX_H_ |
||||
|
namespace storm { |
||||
|
namespace storage { |
||||
|
template<typename IndexType, typename ValueType> |
||||
|
class MatrixEntry { |
||||
|
public: |
||||
|
typedef IndexType index_type; |
||||
|
typedef ValueType value_type; |
||||
|
|
||||
|
/*! |
||||
|
* Constructs a matrix entry with the given column and value. |
||||
|
* |
||||
|
* @param column The column of the matrix entry. |
||||
|
* @param value The value of the matrix entry. |
||||
|
*/ |
||||
|
MatrixEntry(index_type column, value_type value); |
||||
|
|
||||
|
/*! |
 * Move-constructs the matrix entry from the given column-value pair. |
 * |
 * @param pair The column-value pair from which to move-construct the matrix entry. |
 */ |
MatrixEntry(std::pair<index_type, value_type>&& pair); |
||||
|
|
||||
|
MatrixEntry(); |
||||
|
MatrixEntry(MatrixEntry const& other); |
||||
|
MatrixEntry& operator=(MatrixEntry const& other); |
||||
|
#ifndef WINDOWS |
||||
|
MatrixEntry(MatrixEntry&& other); |
||||
|
MatrixEntry& operator=(MatrixEntry&& other); |
||||
|
#endif |
||||
|
|
||||
|
/*! |
||||
|
* Retrieves the column of the matrix entry. |
||||
|
* |
||||
|
* @return The column of the matrix entry. |
||||
|
*/ |
||||
|
index_type const& getColumn() const; |
||||
|
|
||||
|
/*! |
||||
|
* Sets the column of the current entry. |
||||
|
* |
||||
|
* @param column The column to set for this entry. |
||||
|
*/ |
||||
|
void setColumn(index_type const& column); |
||||
|
|
||||
|
/*! |
||||
|
* Retrieves the value of the matrix entry. |
||||
|
* |
||||
|
* @return The value of the matrix entry. |
||||
|
*/ |
||||
|
value_type const& getValue() const; |
||||
|
|
||||
|
/*! |
||||
|
* Sets the value of the entry in the matrix. |
||||
|
* |
||||
|
* @param value The value that is to be set for this entry. |
||||
|
*/ |
||||
|
void setValue(value_type const& value); |
||||
|
|
||||
|
/*! |
||||
|
* Retrieves a pair of column and value that characterizes this entry. |
||||
|
* |
||||
|
* @return A column-value pair that characterizes this entry. |
||||
|
*/ |
||||
|
std::pair<index_type, value_type> const& getColumnValuePair() const; |
||||
|
|
||||
|
/*! |
||||
|
* Multiplies the entry with the given factor and returns the result. |
||||
|
* |
||||
|
* @param factor The factor with which to multiply the entry. |
||||
|
*/ |
||||
|
MatrixEntry operator*(value_type factor) const; |
||||
|
|
||||
|
template<typename IndexTypePrime, typename ValueTypePrime> |
||||
|
friend std::ostream& operator<<(std::ostream& out, MatrixEntry<IndexTypePrime, ValueTypePrime> const& entry); |
||||
|
private: |
||||
|
// The actual matrix entry. |
||||
|
std::pair<index_type, value_type> entry; |
||||
|
}; |
||||
|
|
||||
|
} |
||||
|
} |
||||
|
#endif |
||||
|
|
||||
|
size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); |
||||
|
bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
||||
|
bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
||||
|
|
||||
|
size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); |
||||
|
bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
||||
|
bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
||||
|
|
||||
|
void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b); |
||||
|
void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b); |
||||
|
void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector); |
||||
|
void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector); |
||||
|
void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement); |
||||
|
void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement); |
||||
|
|
||||
|
void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b); |
||||
|
void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b); |
||||
|
void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector); |
||||
|
void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector); |
||||
|
void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement); |
||||
|
void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement); |
||||
|
|
||||
|
#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
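To make the calling convention of the exported functions concrete, the following hypothetical toy setup (data and function name invented for illustration) populates the structures in the format implied above: matrixRowIndices holds rowCount + 1 CSR offsets, columnIndicesAndValues holds one MatrixEntry per non-zero, x has one entry per row group, b one entry per row, and nondeterministicChoiceIndices holds rowGroupCount + 1 group offsets.

#include <cstdint>
#include <vector>
#include "basicValueIteration.h"

// Toy model: 2 row groups (states), 4 rows (choices), 8 non-zero entries.
void runToyMvReduce() {
    std::vector<uint_fast64_t> matrixRowIndices = {0, 2, 4, 6, 8};        // rowCount + 1 offsets
    std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> columnIndicesAndValues;
    columnIndicesAndValues.emplace_back(0, 0.5); columnIndicesAndValues.emplace_back(1, 0.5); // row 0
    columnIndicesAndValues.emplace_back(0, 0.2); columnIndicesAndValues.emplace_back(1, 0.8); // row 1
    columnIndicesAndValues.emplace_back(0, 0.6); columnIndicesAndValues.emplace_back(1, 0.4); // row 2
    columnIndicesAndValues.emplace_back(0, 0.3); columnIndicesAndValues.emplace_back(1, 0.7); // row 3
    std::vector<double> x(2, 0.0);                                        // one value per row group
    std::vector<double> b = {0.0, 0.0, 0.5, 0.25};                        // one offset value per row
    std::vector<uint_fast64_t> nondeterministicChoiceIndices = {0, 2, 4}; // rowGroupCount + 1 offsets
    size_t iterationCount = 0;

    bool converged = basicValueIteration_mvReduce_uint64_double_minimize(10000, 1e-6, false,
        matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
    (void) converged; // x now holds the (approximate) minimal values per row group
}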
@ -0,0 +1,19 @@ |
|||||
|
#ifndef STORM_CUDAFORSTORM_CUDAFORSTORM_H_
#define STORM_CUDAFORSTORM_CUDAFORSTORM_H_

/*
 * List of exported functions in this library
 */

// TopologicalValueIteration
#include "basicValueIteration.h"

// Utility Functions
#include "utility.h"

// Version Information
#include "version.h"

#endif // STORM_CUDAFORSTORM_CUDAFORSTORM_H_
@ -0,0 +1,49 @@ |
|||||
|
#pragma once

#include "cuspExtensionFloat.h"
#include "cuspExtensionDouble.h"

namespace cusp {
namespace detail {
namespace device {

template <typename ValueType>
void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) {
	//
	throw;
}
template <>
void storm_cuda_opt_spmv_csr_vector<double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) {
	storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
template <>
void storm_cuda_opt_spmv_csr_vector<float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) {
	storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}

template <bool Minimize, typename ValueType>
void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) {
	//
	throw;
}
template <>
void storm_cuda_opt_vector_reduce<true, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
	storm_cuda_opt_vector_reduce_double<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}
template <>
void storm_cuda_opt_vector_reduce<false, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
	storm_cuda_opt_vector_reduce_double<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}

template <>
void storm_cuda_opt_vector_reduce<true, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
	storm_cuda_opt_vector_reduce_float<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}
template <>
void storm_cuda_opt_vector_reduce<false, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
	storm_cuda_opt_vector_reduce_float<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}

} // end namespace device
} // end namespace detail
} // end namespace cusp
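The dispatch header above lets templated solver code pick the type-specific kernel at compile time. A hedged sketch of how a caller might drive one value-iteration step through it (function name invented; all pointers are device pointers already allocated and filled; the authoritative loop, including adding the offset vector b and the convergence check, lives in basicValueIteration.cu):

// One step of value iteration expressed through the dispatch functions above.
template <bool Minimize, typename ValueType>
void valueIterationStep(uint_fast64_t rowCount, uint_fast64_t rowGroupCount, uint_fast64_t nnzCount,
                        uint_fast64_t const* rowIndices, ValueType const* columnIndicesAndValues,
                        uint_fast64_t const* choiceIndices,
                        ValueType const* x, ValueType* multiplyResult, ValueType* xSwap) {
    // multiplyResult = A * x (one entry per nondeterministic choice)
    cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(rowCount, nnzCount, rowIndices, columnIndicesAndValues, x, multiplyResult);
    // ... add the offset vector b to multiplyResult here ...
    // xSwap[group] = min/max of multiplyResult over the choices of that group
    cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(rowGroupCount, rowCount, choiceIndices, xSwap, multiplyResult);
}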
@ -0,0 +1,361 @@ |
|||||
|
/* |
 * This is an extension of the original CUSP csr_vector.h SpMV implementation. |
 * It is based on that code and was modified to fit the needs of the StoRM code base. |
 * The changes concern 1) a different input format, 2) the sum calculation and 3) the group-reduce algorithm. |
 */ |
||||
|
|
||||
|
/* |
||||
|
* Copyright 2008-2009 NVIDIA Corporation |
||||
|
* |
||||
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
* you may not use this file except in compliance with the License. |
||||
|
* You may obtain a copy of the License at |
||||
|
* |
||||
|
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
* |
||||
|
* Unless required by applicable law or agreed to in writing, software |
||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
* See the License for the specific language governing permissions and |
||||
|
* limitations under the License. |
||||
|
*/ |
||||
|
|
||||
|
|
||||
|
#pragma once |
||||
|
|
||||
|
#include <limits> |
||||
|
#include <cstdint> |
||||
|
#include <algorithm> |
||||
|
|
||||
|
#include <math_functions.h> |
||||
|
|
||||
|
#include <cusp/detail/device/spmv/csr_vector.h> |
||||
|
|
||||
|
namespace cusp |
||||
|
{ |
||||
|
namespace detail |
||||
|
{ |
||||
|
namespace device |
||||
|
{ |
||||
|
|
||||
|
////////////////////////////////////////////////////////////////////////////// |
||||
|
// CSR SpMV kernels based on a vector model (one warp per row) |
||||
|
////////////////////////////////////////////////////////////////////////////// |
||||
|
// |
||||
|
// spmv_csr_vector_device |
||||
|
// Each row of the CSR matrix is assigned to a warp. The warp computes |
||||
|
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
||||
|
// the x vector, in parallel. This division of work implies that |
||||
|
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
||||
|
// manner (but generally not aligned). On GT200 these accesses are |
||||
|
// coalesced, unlike kernels based on the one-row-per-thread division of |
||||
|
// work. Since an entire 32-thread warp is assigned to each row, many |
||||
|
// threads will remain idle when their row contains a small number |
||||
|
// of elements. This code relies on implicit synchronization among |
||||
|
// threads in a warp. |
||||
|
// |
||||
|
// spmv_csr_vector_tex_device |
//   Same as spmv_csr_vector_device, except that the texture cache is |
//   used for accessing the x vector. |
||||
|
// |
||||
|
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
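As a plain-C++ reference for what the warp-per-row kernel below computes (an editorial sketch, not part of this file): non-zero j of the packed double array stores its column index bit-cast into slot 2*j and its value in slot 2*j + 1, and y[i] is the dot product of row i with x. The sketch assumes uint_fast64_t is 8 bytes, which is what the plugin's alignment checks verify, and uses memcpy as the host-safe equivalent of the kernel's reinterpret_cast.

#include <cstdint>
#include <cstring>

// Sequential reference of the packed CSR SpMV computed by the kernel below.
void referenceSpmvPackedDouble(uint_fast64_t num_rows, uint_fast64_t const* rowIndices,
                               double const* columnIndicesAndValues, double const* x, double* y) {
    for (uint_fast64_t row = 0; row < num_rows; ++row) {
        double sum = 0.0;
        for (uint_fast64_t jj = rowIndices[row]; jj < rowIndices[row + 1]; ++jj) {
            uint_fast64_t column;
            std::memcpy(&column, &columnIndicesAndValues[2 * jj], sizeof(column)); // slot 2*jj: bit-cast column index
            sum += columnIndicesAndValues[2 * jj + 1] * x[column];                 // slot 2*jj + 1: matrix value
        }
        y[row] = sum;
    }
}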
||||
|
|
||||
|
|
||||
|
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
||||
|
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
||||
|
__global__ void |
||||
|
storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y) |
||||
|
{ |
||||
|
__shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
||||
|
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; |
||||
|
|
||||
|
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
||||
|
|
||||
|
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
||||
|
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
||||
|
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
||||
|
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
||||
|
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
||||
|
|
||||
|
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
||||
|
{ |
||||
|
// use two threads to fetch Ap[row] and Ap[row+1] |
||||
|
// this is considerably faster than the straightforward version |
||||
|
if(thread_lane < 2) |
||||
|
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
||||
|
|
||||
|
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
||||
|
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
||||
|
|
||||
|
// initialize local sum |
||||
|
double sum = 0; |
||||
|
|
||||
|
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
||||
|
{ |
||||
|
// ensure aligned memory access to Aj and Ax |
||||
|
|
||||
|
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
||||
|
|
||||
|
// accumulate local sums |
||||
|
if(jj >= row_start && jj < row_end) { |
||||
|
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
} |
||||
|
|
||||
|
// accumulate local sums |
||||
|
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
||||
|
} |
||||
|
} else { |
||||
|
// accumulate local sums |
||||
|
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// store local sum in shared memory |
||||
|
sdata[threadIdx.x] = sum; |
||||
|
|
||||
|
// reduce local sums to row sum |
||||
|
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
||||
|
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
||||
|
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
||||
|
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
||||
|
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
||||
|
|
||||
|
// first thread writes the result |
||||
|
if (thread_lane == 0) |
||||
|
y[row] = sdata[threadIdx.x]; |
||||
|
} |
||||
|
} |
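The in-warp reduction above goes through shared memory and relies on implicit warp-synchronous execution. Purely for comparison, and only under the assumption of CUDA 9 or newer, the same per-row sum could be expressed with warp shuffles; this is not what the plugin does, just an illustrative alternative:

// Alternative in-warp sum using warp shuffles (CUDA 9+), shown for comparison only.
template <unsigned int THREADS_PER_VECTOR>
__device__ double warpReduceSum(double sum) {
    // Each step halves the number of participating lanes, mirroring the sdata[] reduction above.
    for (unsigned int offset = THREADS_PER_VECTOR / 2; offset > 0; offset /= 2) {
        sum += __shfl_down_sync(0xffffffff, sum, offset, THREADS_PER_VECTOR);
    }
    return sum; // lane 0 of each vector holds the row sum
}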
||||
|
|
||||
|
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize> |
||||
|
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) |
||||
|
__global__ void |
||||
|
storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer) |
||||
|
{ |
||||
|
__shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals |
||||
|
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; |
||||
|
|
||||
|
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; |
||||
|
|
||||
|
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
||||
|
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector |
||||
|
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index |
||||
|
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block |
||||
|
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors |
||||
|
|
||||
|
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
||||
|
{ |
||||
|
// use two threads to fetch Ap[row] and Ap[row+1] |
||||
|
// this is considerably faster than the straightforward version |
||||
|
if(thread_lane < 2) |
||||
|
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; |
||||
|
|
||||
|
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
||||
|
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
||||
|
|
||||
|
// initialize local Min/Max |
||||
|
double localMinMaxElement = minMaxInitializer; |
||||
|
|
||||
|
if (THREADS_PER_ROW == 32 && row_end - row_start > 32) |
||||
|
{ |
||||
|
// ensure aligned memory access to Aj and Ax |
||||
|
|
||||
|
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; |
||||
|
|
||||
|
// accumulate the local minimum/maximum |
||||
|
if(jj >= row_start && jj < row_end) { |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// accumulate the local minimum/maximum |
||||
|
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
// accumulate the local minimum/maximum |
||||
|
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// store the local minimum/maximum in shared memory |
||||
|
sdata[threadIdx.x] = localMinMaxElement; |
||||
|
|
||||
|
// reduce local min/max to row min/max |
||||
|
if (Minimize) { |
||||
|
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
||||
|
|
||||
|
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); |
||||
|
} else { |
||||
|
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
||||
|
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); |
||||
|
} |
||||
|
|
||||
|
// first thread writes the result |
||||
|
if (thread_lane == 0) |
||||
|
x[row] = sdata[threadIdx.x]; |
||||
|
} |
||||
|
} |
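For reference, the grouped reduction above computes, for every row group, the minimum or maximum of y over that group's choices. A sequential sketch of the same computation (editorial illustration, not part of this file; function name invented):

#include <algorithm>
#include <cstdint>
#include <limits>

// Sequential reference of the grouped min/max reduction computed by the kernel above.
template <bool Minimize>
void referenceVectorReduceDouble(uint_fast64_t num_groups, uint_fast64_t const* nondeterministicChoiceIndices,
                                 double* x, double const* y) {
    for (uint_fast64_t group = 0; group < num_groups; ++group) {
        double best = Minimize ? std::numeric_limits<double>::max() : -std::numeric_limits<double>::max();
        for (uint_fast64_t jj = nondeterministicChoiceIndices[group]; jj < nondeterministicChoiceIndices[group + 1]; ++jj) {
            best = Minimize ? std::min(best, y[jj]) : std::max(best, y[jj]);
        }
        x[group] = best; // one reduced value per row group
    }
}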
||||
|
|
||||
|
template <bool Minimize, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) |
||||
|
{ |
||||
|
double __minMaxInitializer = -std::numeric_limits<double>::max(); |
||||
|
if (Minimize) { |
||||
|
__minMaxInitializer = std::numeric_limits<double>::max(); |
||||
|
} |
||||
|
const double minMaxInitializer = __minMaxInitializer; |
||||
|
|
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); |
||||
|
} |
||||
|
|
||||
|
template <bool Minimize> |
||||
|
void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) |
||||
|
{ |
||||
|
const uint_fast64_t rows_per_group = num_entries / num_rows; |
||||
|
|
||||
|
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_vector_reduce_double<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y); |
||||
|
} |
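The dispatcher above sizes the per-group thread count from the average group size, rows_per_group = num_entries / num_rows. For instance, 100 row groups with 250 choices in total give rows_per_group = 2, so the 2-threads-per-row instantiation is launched; only groups larger than 16 choices on average use a full warp. A small standalone restatement of the same selection rule (hypothetical helper, not part of this file):

// Mirrors the THREADS_PER_ROW selection used by storm_cuda_opt_vector_reduce_double above.
inline unsigned int threadsPerRowFor(uint_fast64_t num_rows, uint_fast64_t num_entries) {
    uint_fast64_t const rows_per_group = num_entries / num_rows;
    if (rows_per_group <= 2) return 2;
    if (rows_per_group <= 4) return 4;
    if (rows_per_group <= 8) return 8;
    if (rows_per_group <= 16) return 16;
    return 32;
}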
||||
|
|
||||
|
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
||||
|
{ |
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
if (UseCache) |
||||
|
bind_x(x); |
||||
|
|
||||
|
storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
|
||||
|
if (UseCache) |
||||
|
unbind_x(x); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
} |
||||
|
|
||||
|
// NON-OPT |
||||
|
|
||||
|
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
||||
|
{ |
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
if (UseCache) |
||||
|
bind_x(x); |
||||
|
|
||||
|
spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
|
||||
|
if (UseCache) |
||||
|
unbind_x(x); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
} |
||||
|
|
||||
|
} // end namespace device |
||||
|
} // end namespace detail |
||||
|
} // end namespace cusp |
@ -0,0 +1,375 @@ |
|||||
|
/* |
 * This is an extension of the original CUSP csr_vector.h SpMV implementation. |
 * It is based on that code and was modified to fit the needs of the StoRM code base. |
 * As this is mostly copy & paste, the original license still applies. |
 */ |
||||
|
|
||||
|
/* |
||||
|
* Copyright 2008-2009 NVIDIA Corporation |
||||
|
* |
||||
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
* you may not use this file except in compliance with the License. |
||||
|
* You may obtain a copy of the License at |
||||
|
* |
||||
|
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
* |
||||
|
* Unless required by applicable law or agreed to in writing, software |
||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
* See the License for the specific language governing permissions and |
||||
|
* limitations under the License. |
||||
|
*/ |
||||
|
|
||||
|
|
||||
|
#pragma once |
||||
|
|
||||
|
#include <limits> |
||||
|
#include <cstdint> |
||||
|
#include <algorithm> |
||||
|
|
||||
|
#include <math_functions.h> |
||||
|
|
||||
|
#include <cusp/detail/device/spmv/csr_vector.h> |
||||
|
|
||||
|
#include "storm-cudaplugin-config.h" |
||||
|
|
||||
|
namespace cusp |
||||
|
{ |
||||
|
namespace detail |
||||
|
{ |
||||
|
namespace device |
||||
|
{ |
||||
|
|
||||
|
////////////////////////////////////////////////////////////////////////////// |
||||
|
// CSR SpMV kernels based on a vector model (one warp per row) |
||||
|
////////////////////////////////////////////////////////////////////////////// |
||||
|
// |
||||
|
// spmv_csr_vector_device |
||||
|
// Each row of the CSR matrix is assigned to a warp. The warp computes |
||||
|
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
||||
|
// the x vector, in parallel. This division of work implies that |
||||
|
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
||||
|
// manner (but generally not aligned). On GT200 these accesses are |
||||
|
// coalesced, unlike kernels based on the one-row-per-thread division of |
||||
|
// work. Since an entire 32-thread warp is assigned to each row, many |
||||
|
// threads will remain idle when their row contains a small number |
||||
|
// of elements. This code relies on implicit synchronization among |
||||
|
// threads in a warp. |
||||
|
// |
||||
|
// spmv_csr_vector_tex_device |
//   Same as spmv_csr_vector_device, except that the texture cache is |
//   used for accessing the x vector. |
||||
|
// |
||||
|
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
||||
|
|
||||
|
|
||||
|
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
||||
|
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
||||
|
__global__ void |
||||
|
storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y) |
||||
|
{ |
||||
|
__shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
||||
|
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; |
||||
|
|
||||
|
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
||||
|
|
||||
|
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
||||
|
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
||||
|
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
||||
|
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
||||
|
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
||||
|
|
||||
|
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
||||
|
{ |
||||
|
// use two threads to fetch Ap[row] and Ap[row+1] |
||||
|
// this is considerably faster than the straightforward version |
||||
|
if(thread_lane < 2) |
||||
|
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
||||
|
|
||||
|
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
||||
|
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
||||
|
|
||||
|
// initialize local sum |
||||
|
float sum = 0; |
||||
|
|
||||
|
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
||||
|
{ |
||||
|
// ensure aligned memory access to Aj and Ax |
||||
|
|
||||
|
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
||||
|
|
||||
|
// accumulate local sums |
||||
|
if(jj >= row_start && jj < row_end) { |
||||
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
||||
|
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
||||
|
#else |
||||
|
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
||||
|
#endif |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
} |
||||
|
|
||||
|
// accumulate local sums |
||||
|
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
||||
|
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
||||
|
#else |
||||
|
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
||||
|
#endif |
||||
|
} |
||||
|
} else { |
||||
|
// accumulate local sums |
||||
|
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
||||
|
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
||||
|
#else |
||||
|
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
||||
|
#endif |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// store local sum in shared memory |
||||
|
sdata[threadIdx.x] = sum; |
||||
|
|
||||
|
// reduce local sums to row sum |
||||
|
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
||||
|
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
||||
|
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
||||
|
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
||||
|
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
||||
|
|
||||
|
// first thread writes the result |
||||
|
if (thread_lane == 0) |
||||
|
y[row] = sdata[threadIdx.x]; |
||||
|
} |
||||
|
} |
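The float kernel above reads the packed column/value array with a stride of 4 floats when STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT is defined (16 bytes per non-zero: 8-byte column index, 4-byte value, 4 bytes padding) and a stride of 3 floats otherwise (12 bytes: 8-byte index, 4-byte value). The structs below merely illustrate those two layouts; the plugin itself indexes raw float arrays.

#include <cstdint>

// Per-non-zero layout with 64-bit alignment padding: 4 float slots (16 bytes).
struct PackedEntryAligned {
    uint64_t column;  // occupies float slots 0 and 1
    float value;      // float slot 2 (read as matrixColumnIndicesAndValues[4 * jj + 2])
    float padding;    // float slot 3
};

// Per-non-zero layout without padding: 3 float slots (12 bytes).
#pragma pack(push, 1)
struct PackedEntryUnaligned {
    uint64_t column;  // float slots 0 and 1
    float value;      // float slot 2 (read as matrixColumnIndicesAndValues[3 * jj + 2])
};
#pragma pack(pop)

static_assert(sizeof(PackedEntryAligned) == 16, "aligned layout is 4 floats per entry");
static_assert(sizeof(PackedEntryUnaligned) == 12, "unpadded layout is 3 floats per entry");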
||||
|
|
||||
|
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize> |
||||
|
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) |
||||
|
__global__ void |
||||
|
storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer) |
||||
|
{ |
||||
|
__shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals |
||||
|
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; |
||||
|
|
||||
|
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; |
||||
|
|
||||
|
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
||||
|
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector |
||||
|
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index |
||||
|
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block |
||||
|
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors |
||||
|
|
||||
|
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
||||
|
{ |
||||
|
// use two threads to fetch Ap[row] and Ap[row+1] |
||||
|
// this is considerably faster than the straightforward version |
||||
|
if(thread_lane < 2) |
||||
|
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; |
||||
|
|
||||
|
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
||||
|
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
||||
|
|
||||
|
// initialize local Min/Max |
||||
|
float localMinMaxElement = minMaxInitializer; |
||||
|
|
||||
|
if (THREADS_PER_ROW == 32 && row_end - row_start > 32) |
||||
|
{ |
||||
|
// ensure aligned memory access to Aj and Ax |
||||
|
|
||||
|
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; |
||||
|
|
||||
|
// accumulate the local minimum/maximum |
||||
|
if(jj >= row_start && jj < row_end) { |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// accumulate the local minimum/maximum |
||||
|
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
// accumulate the local minimum/maximum |
||||
|
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// store the local minimum/maximum in shared memory |
||||
|
sdata[threadIdx.x] = localMinMaxElement; |
||||
|
|
||||
|
// reduce local min/max to row min/max |
||||
|
if (Minimize) { |
||||
|
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
||||
|
|
||||
|
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); |
||||
|
} else { |
||||
|
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
||||
|
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); |
||||
|
} |
||||
|
|
||||
|
// first thread writes the result |
||||
|
if (thread_lane == 0) |
||||
|
x[row] = sdata[threadIdx.x]; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
template <bool Minimize, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) |
||||
|
{ |
||||
|
float __minMaxInitializer = -std::numeric_limits<float>::max(); |
||||
|
if (Minimize) { |
||||
|
__minMaxInitializer = std::numeric_limits<float>::max(); |
||||
|
} |
||||
|
const float minMaxInitializer = __minMaxInitializer; |
||||
|
|
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); |
||||
|
} |
||||
|
|
||||
|
template <bool Minimize> |
||||
|
void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) |
||||
|
{ |
||||
|
const uint_fast64_t rows_per_group = num_entries / num_rows; |
||||
|
|
||||
|
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_float<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_float<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_float<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_vector_reduce_float<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y); |
||||
|
} |
||||
|
|
||||
|
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
||||
|
{ |
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
if (UseCache) |
||||
|
bind_x(x); |
||||
|
|
||||
|
storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
|
||||
|
if (UseCache) |
||||
|
unbind_x(x); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
} |
||||
|
|
||||
|
// NON-OPT |
||||
|
|
||||
|
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
||||
|
{ |
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
if (UseCache) |
||||
|
bind_x(x); |
||||
|
|
||||
|
spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
|
||||
|
if (UseCache) |
||||
|
unbind_x(x); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
} |
||||
|
|
||||
|
} // end namespace device |
||||
|
} // end namespace detail |
||||
|
} // end namespace cusp |
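Editor's note: as a sanity check on the launch-size arithmetic used by the host wrappers above, the following host-only sketch reproduces the dispatch on the average number of entries per row and the resulting block count. The helper divide_into mirrors CUSP's DIVIDE_INTO macro, and maxActiveBlocks is a placeholder for the device-dependent cusp::detail::device::arch::max_active_blocks query; both, as well as the concrete matrix dimensions, are assumptions for illustration only.

// Host-only sketch (no CUDA required) of the launch-configuration arithmetic.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static size_t divide_into(size_t n, size_t granularity) {
    return (n + granularity - 1) / granularity;  // ceiling division, as in CUSP's DIVIDE_INTO
}

static unsigned int pickThreadsPerVector(uint_fast64_t num_rows, uint_fast64_t num_entries) {
    uint_fast64_t const nnz_per_row = num_entries / num_rows;  // same heuristic as the dispatchers above
    if (nnz_per_row <= 2)  return 2;
    if (nnz_per_row <= 4)  return 4;
    if (nnz_per_row <= 8)  return 8;
    if (nnz_per_row <= 16) return 16;
    return 32;
}

int main() {
    uint_fast64_t const num_rows = 100000;     // invented example size
    uint_fast64_t const num_entries = 450000;  // roughly 4.5 entries per row
    size_t const THREADS_PER_BLOCK = 128;
    size_t const maxActiveBlocks = 1024;       // placeholder for the occupancy query

    unsigned int const threadsPerVector = pickThreadsPerVector(num_rows, num_entries);
    size_t const vectorsPerBlock = THREADS_PER_BLOCK / threadsPerVector;
    size_t const numBlocks = std::min<size_t>(maxActiveBlocks, divide_into(num_rows, vectorsPerBlock));

    std::printf("THREADS_PER_VECTOR = %u, VECTORS_PER_BLOCK = %zu, NUM_BLOCKS = %zu\n",
                threadsPerVector, vectorsPerBlock, numBlocks);
    return 0;
}

For the example sizes above this selects 4 threads per vector, 32 vectors per block, and caps the grid at the (assumed) occupancy limit of 1024 blocks.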
@ -0,0 +1,39 @@
#include <iostream>
#include <chrono>

__global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) {
    *B = *A;
}

void kernelSwitchTest(size_t N) {
    int* deviceIntA;
    int* deviceIntB;

    // Allocate space on the device
    if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) {
        std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
        return;
    }
    if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) {
        std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
        return;
    }

    // Launch the trivial kernel N times and measure the elapsed host time
    // (no explicit device synchronization is performed).
    auto start_time = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < N; ++i) {
        cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB);
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    std::cout << "Switching the kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " microseconds" << std::endl;
    std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << " microseconds per kernel switch" << std::endl;

    // Free memory on the device
    if (cudaFree(deviceIntA) != cudaSuccess) {
        std::cout << "Error in cudaFree!" << std::endl;
        return;
    }
    if (cudaFree(deviceIntB) != cudaSuccess) {
        std::cout << "Error in cudaFree!" << std::endl;
        return;
    }
}
@ -0,0 +1 @@
void kernelSwitchTest(size_t N);
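Editor's note: a minimal stand-alone driver for this benchmark could look as follows. It assumes only the declaration from kernelSwitchTest.h; the loop count of 10000 is an arbitrary choice for illustration.

// Hypothetical driver for the kernel-switch benchmark (link against kernelSwitchTest.cu).
#include <cstddef>

#include "kernelSwitchTest.h"

int main() {
    kernelSwitchTest(10000);  // prints the total and per-launch overhead in microseconds
    return 0;
}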
@ -0,0 +1,33 @@
#include "utility.h"

#include <cuda_runtime.h>

size_t getFreeCudaMemory() {
    size_t freeMemory;
    size_t totalMemory;
    cudaMemGetInfo(&freeMemory, &totalMemory);

    return freeMemory;
}

size_t getTotalCudaMemory() {
    size_t freeMemory;
    size_t totalMemory;
    cudaMemGetInfo(&freeMemory, &totalMemory);

    return totalMemory;
}

bool resetCudaDevice() {
    cudaError_t result = cudaDeviceReset();
    return (result == cudaSuccess);
}

int getRuntimeCudaVersion() {
    int result = -1;
    cudaError_t errorResult = cudaRuntimeGetVersion(&result);
    if (errorResult != cudaSuccess) {
        return -1;
    }
    return result;
}
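Editor's note: a small, purely illustrative consumer of these utilities is sketched below. The "at least half the memory free" threshold is an invented example policy, not something prescribed by the plugin.

// Illustrative check using the declarations from utility.h.
#include <iostream>

#include "utility.h"

int main() {
    std::cout << "CUDA runtime version: " << getRuntimeCudaVersion() << std::endl;
    std::cout << "Free device memory: " << getFreeCudaMemory() << " of "
              << getTotalCudaMemory() << " bytes" << std::endl;

    // A caller might refuse to offload work if, say, less than half the device memory is free.
    bool const enoughMemory = getFreeCudaMemory() * 2 >= getTotalCudaMemory();
    std::cout << (enoughMemory ? "Enough" : "Not enough") << " free memory for offloading." << std::endl;

    return resetCudaDevice() ? 0 : 1;
}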
@ -0,0 +1,12 @@
#ifndef STORM_CUDAFORSTORM_UTILITY_H_
#define STORM_CUDAFORSTORM_UTILITY_H_

// Library exports
#include "cudaForStorm.h"

size_t getFreeCudaMemory();
size_t getTotalCudaMemory();
bool resetCudaDevice();
int getRuntimeCudaVersion();

#endif // STORM_CUDAFORSTORM_UTILITY_H_
@ -0,0 +1,28 @@
#include "version.h"

#include "storm-cudaplugin-config.h"

size_t getStormCudaPluginVersionMajor() {
    return STORM_CUDAPLUGIN_VERSION_MAJOR;
}

size_t getStormCudaPluginVersionMinor() {
    return STORM_CUDAPLUGIN_VERSION_MINOR;
}

size_t getStormCudaPluginVersionPatch() {
    return STORM_CUDAPLUGIN_VERSION_PATCH;
}

size_t getStormCudaPluginVersionCommitsAhead() {
    return STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD;
}

const char* getStormCudaPluginVersionHash() {
    static const std::string versionHash = STORM_CUDAPLUGIN_VERSION_HASH;
    return versionHash.c_str();
}

bool getStormCudaPluginVersionIsDirty() {
    return ((STORM_CUDAPLUGIN_VERSION_DIRTY) != 0);
}
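Editor's note: a hypothetical consumer of this version API, e.g. for a version banner, might look like the sketch below; it only uses the functions declared in version.h.

// Illustrative version banner built from the API above.
#include <iostream>

#include "version.h"

int main() {
    std::cout << "StoRM CUDA plugin "
              << getStormCudaPluginVersionMajor() << "."
              << getStormCudaPluginVersionMinor() << "."
              << getStormCudaPluginVersionPatch()
              << " (" << getStormCudaPluginVersionCommitsAhead() << " commits ahead of tag, git "
              << getStormCudaPluginVersionHash()
              << (getStormCudaPluginVersionIsDirty() ? ", dirty" : "") << ")" << std::endl;
    return 0;
}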
@ -0,0 +1,16 @@
#ifndef STORM_CUDAFORSTORM_VERSION_H_
#define STORM_CUDAFORSTORM_VERSION_H_

// Library exports
#include "cudaForStorm.h"

#include <string>

size_t getStormCudaPluginVersionMajor();
size_t getStormCudaPluginVersionMinor();
size_t getStormCudaPluginVersionPatch();
size_t getStormCudaPluginVersionCommitsAhead();
const char* getStormCudaPluginVersionHash();
bool getStormCudaPluginVersionIsDirty();

#endif // STORM_CUDAFORSTORM_VERSION_H_
@ -0,0 +1,21 @@
/*
 * StoRM - Built-in Options
 *
 * This file is parsed by CMake during makefile generation.
 */

#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_

// Version information
#define STORM_CUDAPLUGIN_VERSION_MAJOR @STORM_CUDAPLUGIN_VERSION_MAJOR@ // The major version of StoRM
#define STORM_CUDAPLUGIN_VERSION_MINOR @STORM_CUDAPLUGIN_VERSION_MINOR@ // The minor version of StoRM
#define STORM_CUDAPLUGIN_VERSION_PATCH @STORM_CUDAPLUGIN_VERSION_PATCH@ // The patch version of StoRM
#define STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD @STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD@ // Number of commits since the tag was last set
#define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" // The short hash of the git commit this build is based on
#define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ // 0 if no files were modified in the checkout, 1 otherwise

// Whether the size of float in a pair<uint_fast64_t, float> is expanded to 64 bit
#@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT

#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
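Editor's note: for illustration, after CMake substitutes the @...@ placeholders (via configure_file), the generated storm-cudaplugin-config.h could look roughly like the sketch below. All concrete values are invented; the last line becomes either a #define or an #undef depending on what STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF expands to.

// Hypothetical generated header; version numbers, hash, and flags are made up.
#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_

// Version information
#define STORM_CUDAPLUGIN_VERSION_MAJOR 1
#define STORM_CUDAPLUGIN_VERSION_MINOR 0
#define STORM_CUDAPLUGIN_VERSION_PATCH 2
#define STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD 14
#define STORM_CUDAPLUGIN_VERSION_HASH "abc1234"
#define STORM_CUDAPLUGIN_VERSION_DIRTY 0

// The placeholder expanded to "define" here; on a platform without the
// expanded float alignment it would expand to "undef" instead.
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT

#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_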