Merge branch 'philippTopologicalRevival' into cuda_integration
Conflicts:
src/storage/Decomposition.h
src/storage/SparseMatrix.cpp
src/storage/SparseMatrix.h
src/storage/StronglyConnectedComponentDecomposition.cpp
src/storage/StronglyConnectedComponentDecomposition.h
src/storm.cpp
test/functional/storage/StronglyConnectedComponentDecompositionTest.cpp
Former-commit-id: 27e660a295
41 changed files with 4329 additions and 110 deletions
  3  .gitmodules
 25  CMakeLists.txt
  2  examples/mdp/scc/scc.pctl
  1  resources/3rdparty/cusplibrary
 56  resources/cmake/FindCusp.cmake
 52  resources/cmake/FindThrust.cmake
 64  resources/cudaForStorm/CMakeAlignmentCheck.cpp
 31  resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
294  resources/cudaForStorm/CMakeLists.txt
  6  resources/cudaForStorm/srcCuda/allCudaKernels.h
  0  resources/cudaForStorm/srcCuda/bandWidth.cu
  0  resources/cudaForStorm/srcCuda/bandWidth.h
286  resources/cudaForStorm/srcCuda/basicAdd.cu
  9  resources/cudaForStorm/srcCuda/basicAdd.h
879  resources/cudaForStorm/srcCuda/basicValueIteration.cu
107  resources/cudaForStorm/srcCuda/basicValueIteration.h
 19  resources/cudaForStorm/srcCuda/cudaForStorm.h
 49  resources/cudaForStorm/srcCuda/cuspExtension.h
361  resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
375  resources/cudaForStorm/srcCuda/cuspExtensionFloat.h
 39  resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
  1  resources/cudaForStorm/srcCuda/kernelSwitchTest.h
 33  resources/cudaForStorm/srcCuda/utility.cu
 12  resources/cudaForStorm/srcCuda/utility.h
 28  resources/cudaForStorm/srcCuda/version.cu
 16  resources/cudaForStorm/srcCuda/version.h
 21  resources/cudaForStorm/storm-cudaplugin-config.h.in
 10  src/counterexamples/PathBasedSubsystemGenerator.h
101  src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h
111  src/models/PseudoModel.cpp
 90  src/models/PseudoModel.h
  3  src/solver/NativeNondeterministicLinearEquationSolver.cpp
  2  src/solver/NativeNondeterministicLinearEquationSolver.h
467  src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
 97  src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
 10  src/utility/graph.h
 19  src/utility/vector.h
  3  storm-config.h.in
240  test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
354  test/functional/solver/CudaPluginTest.cpp
163  test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
@@ -0,0 +1,3 @@
[submodule "resources/3rdparty/cusplibrary"]
	path = resources/3rdparty/cusplibrary
	url = https://github.com/cusplibrary/cusplibrary.git
@@ -0,0 +1,2 @@
Pmin=? [ (!statetwo) U end ]
Pmax=? [ (!statetwo) U end ]
@@ -0,0 +1 @@
Subproject commit d8d7d9e97add8db08ef0ad5c0a7e9929fd83ce3c
@@ -0,0 +1,56 @@
#
# FindCusp
#
# This module finds the CUSP header files and extracts their version. It
# sets the following variables.
#
# CUSP_INCLUDE_DIR - Include directory for cusp header files. (All header
# files will actually be in the cusp subdirectory.)
# CUSP_VERSION - Version of cusp in the form "major.minor.patch".
#
# CUSP_FOUND - Indicates whether Cusp has been found
#

find_path(CUSP_INCLUDE_DIR
  HINTS
    /usr/include/cusp
    /usr/local/include
    /usr/local/cusp/include
    ${CUSP_INCLUDE_DIRS}
    ${CUSP_HINT}
  NAMES cusp/version.h
  DOC "Cusp headers"
  )
if(CUSP_INCLUDE_DIR)
  list(REMOVE_DUPLICATES CUSP_INCLUDE_DIR)

  # Find cusp version
  file(STRINGS ${CUSP_INCLUDE_DIR}/cusp/version.h
    version
    REGEX "#define CUSP_VERSION[ \t]+([0-9x]+)"
    )
  string(REGEX REPLACE
    "#define CUSP_VERSION[ \t]+"
    ""
    version
    "${version}"
    )

  #define CUSP_MAJOR_VERSION (CUSP_VERSION / 100000)
  #define CUSP_MINOR_VERSION (CUSP_VERSION / 100 % 1000)
  #define CUSP_SUBMINOR_VERSION (CUSP_VERSION % 100)

  math(EXPR CUSP_MAJOR_VERSION "${version} / 100000")
  math(EXPR CUSP_MINOR_VERSION "${version} / 100 % 1000")
  math(EXPR CUSP_PATCH_VERSION "${version} % 100")

  set(CUSP_VERSION "${CUSP_MAJOR_VERSION}.${CUSP_MINOR_VERSION}.${CUSP_PATCH_VERSION}")

  # Check for required components
  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(Cusp REQUIRED_VARS CUSP_INCLUDE_DIR VERSION_VAR CUSP_VERSION)

  set(CUSP_INCLUDE_DIRS ${CUSP_INCLUDE_DIR})
  mark_as_advanced(CUSP_INCLUDE_DIR)

endif(CUSP_INCLUDE_DIR)
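The three math(EXPR ...) lines above decode the single integer CUSP_VERSION found in cusp/version.h into a dotted version string. The following is a minimal sketch of the same decoding in C++ (purely illustrative, not part of this commit; the sample value 400 is an assumption):

#include <cstdio>

int main() {
    // Decode a CUSP_VERSION-style integer, mirroring the math(EXPR ...) calls in FindCusp.cmake.
    const int cuspVersion = 400;                 // assumed sample value
    const int major = cuspVersion / 100000;      // -> 0
    const int minor = cuspVersion / 100 % 1000;  // -> 4
    const int patch = cuspVersion % 100;         // -> 0
    std::printf("%d.%d.%d\n", major, minor, patch);  // prints "0.4.0"
    return 0;
}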
@@ -0,0 +1,52 @@
#
# FindThrust
#
# This module finds the Thrust header files and extracts their version. It
# sets the following variables.
#
# THRUST_INCLUDE_DIR - Include directory for thrust header files. (All header
# files will actually be in the thrust subdirectory.)
# THRUST_VERSION - Version of thrust in the form "major.minor.patch".
#
# Thrust_FOUND - Indicates whether Thrust has been found
#

find_path(THRUST_INCLUDE_DIR
  HINTS
    /usr/include/cuda
    /usr/local/include
    /usr/local/cuda/include
    ${CUDA_INCLUDE_DIRS}
  NAMES thrust/version.h
  DOC "Thrust headers"
  )
if(THRUST_INCLUDE_DIR)
  list(REMOVE_DUPLICATES THRUST_INCLUDE_DIR)
endif(THRUST_INCLUDE_DIR)

# Find thrust version
file(STRINGS ${THRUST_INCLUDE_DIR}/thrust/version.h
  version
  REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)"
  )
string(REGEX REPLACE
  "#define THRUST_VERSION[ \t]+"
  ""
  version
  "${version}"
  )

string(REGEX MATCH "^[0-9]" major ${version})
string(REGEX REPLACE "^${major}00" "" version "${version}")
string(REGEX MATCH "^[0-9]" minor ${version})
string(REGEX REPLACE "^${minor}0" "" version "${version}")
set(THRUST_VERSION "${major}.${minor}.${version}")
set(THRUST_MAJOR_VERSION "${major}")
set(THRUST_MINOR_VERSION "${minor}")

# Check for required components
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Thrust REQUIRED_VARS THRUST_INCLUDE_DIR VERSION_VAR THRUST_VERSION)

set(THRUST_INCLUDE_DIRS ${THRUST_INCLUDE_DIR})
mark_as_advanced(THRUST_INCLUDE_DIR)
@@ -0,0 +1,64 @@
/*
 * This is a component of StoRM - Cuda Plugin to check whether type alignment matches the assumptions done while optimizing the code.
 */
#include <cstdint>
#include <utility>
#include <vector>

#define CONTAINER_SIZE 100ul

template <typename IndexType, typename ValueType>
int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
	std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
	for (size_t i = 0; i < containerSize; ++i) {
		myVector->push_back(std::make_pair(firstValue, secondValue));
	}
	size_t myVectorSize = myVector->size();
	IndexType* firstStart = &(myVector->at(0).first);
	IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
	ValueType* secondStart = &(myVector->at(0).second);
	ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
	size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
	size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
	size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
	size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);

	delete myVector;
	myVector = nullptr;

	if (myVectorSize != containerSize) {
		return -2;
	}

	// Check for alignment:
	// Requirement is that the pairs are aligned like: first, second, first, second, first, second, ...
	if (sizeof(IndexType) != sizeof(ValueType)) {
		return -3;
	}
	if (startOffset != sizeof(IndexType)) {
		return -4;
	}
	if (endOffset != sizeof(IndexType)) {
		return -5;
	}
	if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
		return -6;
	}
	if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
		return -7;
	}

	return 0;
}


int main(int argc, char* argv[]) {
	int result = 0;

	result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
	if (result != 0) {
		return result;
	}

	return 0;
}
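The runtime check above verifies that std::pair<IndexType, ValueType> elements are laid out back to back (first, second, first, second, ...) without padding, which is what later allows a vector of such pairs to be handed to the GPU as one interleaved index/value buffer. A compile-time spot-check of the same assumption could look like the following sketch (illustrative only, not part of this commit; it uses the uint_fast64_t/double instantiation that main() above exercises):

#include <cstdint>
#include <utility>

// If the pair is packed, its size equals the sum of its members' sizes -
// the same property the runtime offset checks above verify element by element.
static_assert(sizeof(std::pair<uint_fast64_t, double>) ==
              sizeof(uint_fast64_t) + sizeof(double),
              "std::pair<uint_fast64_t, double> must not contain padding");

int main() { return 0; }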
@@ -0,0 +1,31 @@
/*
 * This is a component of StoRM - Cuda Plugin to check whether a pair of uint_fast64_t and float gets auto-aligned to match 64bit boundaries
 */
#include <cstdint>
#include <utility>
#include <vector>

#define CONTAINER_SIZE 100ul

int main(int argc, char* argv[]) {
	int result = 0;

	std::vector<std::pair<uint_fast64_t, float>> myVector;
	for (size_t i = 0; i < CONTAINER_SIZE; ++i) {
		myVector.push_back(std::make_pair(i, 42.12345f * i));
	}

	char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first));
	char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first));
	ptrdiff_t uintDiff = secondUintPointer - firstUintPointer;

	if (uintDiff == (2 * sizeof(uint_fast64_t))) {
		result = 2;
	} else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) {
		result = 3;
	} else {
		result = -5;
	}

	return result;
}
@@ -0,0 +1,294 @@
|||
cmake_minimum_required (VERSION 2.8.6) |
|||
|
|||
# Set project name |
|||
project (cudaForStorm CXX C) |
|||
|
|||
# Set the version number |
|||
set (STORM_CPP_VERSION_MAJOR 1) |
|||
set (STORM_CPP_VERSION_MINOR 0) |
|||
|
|||
# Add base folder for better inclusion paths |
|||
include_directories("${PROJECT_SOURCE_DIR}") |
|||
include_directories("${PROJECT_SOURCE_DIR}/src") |
|||
|
|||
message(STATUS "StoRM (CudaPlugin) - CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}") |
|||
|
|||
############################################################# |
|||
## |
|||
## CMake options of StoRM |
|||
## |
|||
############################################################# |
|||
option(CUDAFORSTORM_DEBUG "Sets whether the DEBUG mode is used" ON) |
|||
option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF) |
|||
option(USE_LIBCXX "Sets whether the standard library is libc++." OFF) |
|||
|
|||
set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.") |
|||
set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.") |
|||
set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE STRING "The Build directory of storm, where the library files should be installed to (if available).") |
|||
|
|||
############################################################# |
|||
## |
|||
## Inclusion of required libraries |
|||
## |
|||
############################################################# |
|||
|
|||
# Add the resources/cmake folder to Module Search Path for FindTBB.cmake |
|||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/") |
|||
|
|||
# Set the hint for CUSP |
|||
set(CUSP_HINT "${PROJECT_SOURCE_DIR}/../3rdparty/cusplibrary") |
|||
|
|||
find_package(CUDA REQUIRED) |
|||
find_package(Cusp REQUIRED) |
|||
find_package(Doxygen REQUIRED) |
|||
find_package(Thrust REQUIRED) |
|||
|
|||
# If the DEBUG option was turned on, we will target a debug version and a release version otherwise |
|||
if (CUDAFORSTORM_DEBUG) |
|||
set (CMAKE_BUILD_TYPE "DEBUG") |
|||
else() |
|||
set (CMAKE_BUILD_TYPE "RELEASE") |
|||
endif() |
|||
message(STATUS "StoRM (CudaPlugin) - Building ${CMAKE_BUILD_TYPE} version.") |
|||
|
|||
message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") |
|||
message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}") |
|||
|
|||
############################################################# |
|||
## |
|||
## CUDA Options |
|||
## |
|||
############################################################# |
|||
SET (CUDA_VERBOSE_BUILD ON CACHE BOOL "nvcc verbose" FORCE) |
|||
set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON) |
|||
set(BUILD_SHARED_LIBS OFF) |
|||
set(CUDA_SEPARABLE_COMPILATION ON) |
|||
#set(CUDA_NVCC_FLAGS "-arch=sm_30") |
|||
|
|||
# Because the FindCUDA.cmake file has a path related bug, two folders have to be present |
|||
file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Debug") |
|||
file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Release") |
|||
|
|||
|
|||
############################################################# |
|||
## |
|||
## Compiler specific settings and definitions |
|||
## |
|||
############################################################# |
|||
if(CMAKE_COMPILER_IS_GNUCC) |
|||
message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: GCC") |
|||
# Set standard flags for GCC |
|||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops") |
|||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -pedantic") |
|||
elseif(MSVC) |
|||
message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: MSVC") |
|||
# required for GMM to compile, ugly error directive in their code |
|||
add_definitions(/D_SCL_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS) |
|||
# required as the PRCTL Parser bloats object files (COFF) beyond their maximum size (see http://msdn.microsoft.com/en-us/library/8578y171(v=vs.110).aspx) |
|||
add_definitions(/bigobj) |
|||
# required by GTest and PrismGrammar::createIntegerVariable |
|||
add_definitions(/D_VARIADIC_MAX=10) |
|||
# Windows.h breaks GMM in gmm_except.h because of its macro definition for min and max |
|||
add_definitions(/DNOMINMAX) |
|||
else(CLANG) |
|||
message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: Clang (LLVM)") |
|||
# As CLANG is not set as a variable, we need to set it in case we have not matched another compiler. |
|||
set (CLANG ON) |
|||
# Set standard flags for clang |
|||
set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops -O3") |
|||
if(UNIX AND NOT APPLE AND NOT USE_LIBCXX) |
|||
set(CLANG_STDLIB libstdc++) |
|||
message(STATUS "StoRM (CudaPlugin) - Linking against libstdc++") |
|||
else() |
|||
set(CLANG_STDLIB libc++) |
|||
message(STATUS "StoRM (CudaPlugin) - Linking against libc++") |
|||
# Set up some Xcode specific settings |
|||
set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11") |
|||
set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") |
|||
endif() |
|||
|
|||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=${CLANG_STDLIB} -Wall -pedantic -Wno-unused-variable -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -ftemplate-depth=1024") |
|||
|
|||
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g") |
|||
endif() |
|||
|
|||
############################################################# |
|||
## |
|||
## CMake-generated Config File for StoRM |
|||
## |
|||
############################################################# |
|||
|
|||
# Test for type alignment |
|||
try_run(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT |
|||
${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp" |
|||
COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR |
|||
) |
|||
if(NOT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT) |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}") |
|||
elseif(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT EQUAL 0) |
|||
message(STATUS "StoRM (CudaPlugin) - Result of Type Alignment Check: OK.") |
|||
else() |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})") |
|||
endif() |
|||
|
|||
# Test for Float 64bit Alignment |
|||
try_run(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT |
|||
${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp" |
|||
COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR |
|||
) |
|||
if(NOT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT) |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test float type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}") |
|||
elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 2) |
|||
message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment active.") |
|||
set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "define") |
|||
elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 3) |
|||
message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment disabled.") |
|||
set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "undef") |
|||
else() |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Float Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_FLOATALIGNMENT})") |
|||
endif() |
|||
|
|||
|
|||
# |
|||
# Make a version file containing the current version from git. |
|||
# |
|||
include(GetGitRevisionDescription) |
|||
git_describe_checkout(STORM_GIT_VERSION_STRING) |
|||
# Parse the git Tag into variables |
|||
string(REGEX REPLACE "^([0-9]+)\\..*" "\\1" STORM_CUDAPLUGIN_VERSION_MAJOR "${STORM_GIT_VERSION_STRING}") |
|||
string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_MINOR "${STORM_GIT_VERSION_STRING}") |
|||
string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_PATCH "${STORM_GIT_VERSION_STRING}") |
|||
string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-([0-9]+)\\-.*" "\\1" STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD "${STORM_GIT_VERSION_STRING}") |
|||
string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-([a-z0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_HASH "${STORM_GIT_VERSION_STRING}") |
|||
string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-[a-z0-9]+\\-(.*)" "\\1" STORM_CUDAPLUGIN_VERSION_APPENDIX "${STORM_GIT_VERSION_STRING}") |
|||
if ("${STORM_CUDAPLUGIN_VERSION_APPENDIX}" MATCHES "^.*dirty.*$") |
|||
set(STORM_CUDAPLUGIN_VERSION_DIRTY 1) |
|||
else() |
|||
set(STORM_CUDAPLUGIN_VERSION_DIRTY 0) |
|||
endif() |
|||
message(STATUS "StoRM (CudaPlugin) - Version information: ${STORM_CUDAPLUGIN_VERSION_MAJOR}.${STORM_CUDAPLUGIN_VERSION_MINOR}.${STORM_CUDAPLUGIN_VERSION_PATCH} (${STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD} commits ahead of Tag) build from ${STORM_CUDAPLUGIN_VERSION_HASH} (Dirty: ${STORM_CUDAPLUGIN_VERSION_DIRTY})") |
|||
|
|||
|
|||
# Configure a header file to pass some of the CMake settings to the source code |
|||
configure_file ( |
|||
"${PROJECT_SOURCE_DIR}/storm-cudaplugin-config.h.in" |
|||
"${PROJECT_BINARY_DIR}/include/storm-cudaplugin-config.h" |
|||
) |
|||
# Add the binary dir include directory for storm-config.h |
|||
include_directories("${PROJECT_BINARY_DIR}/include") |
|||
|
|||
# Add the main source directory for includes |
|||
include_directories("${PROJECT_SOURCE_DIR}/../../src") |
|||
|
|||
############################################################# |
|||
## |
|||
## Source file aggregation and clustering |
|||
## |
|||
############################################################# |
|||
file(GLOB_RECURSE CUDAFORSTORM_HEADERS ${PROJECT_SOURCE_DIR}/src/*.h) |
|||
file(GLOB_RECURSE CUDAFORSTORM_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp) |
|||
|
|||
file(GLOB_RECURSE CUDAFORSTORM_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/srcCuda/*.cu") |
|||
file(GLOB_RECURSE CUDAFORSTORM_CUDA_HEADERS "${PROJECT_SOURCE_DIR}/srcCuda/*.h") |
|||
|
|||
# Additional include files like the storm-config.h |
|||
file(GLOB_RECURSE CUDAFORSTORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h) |
|||
|
|||
# Group the headers and sources |
|||
source_group(main FILES ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) |
|||
source_group(cuda FILES ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}) |
|||
|
|||
# Add custom additional include or link directories |
|||
if (ADDITIONAL_INCLUDE_DIRS) |
|||
message(STATUS "StoRM (CudaPlugin) - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}") |
|||
include_directories(${ADDITIONAL_INCLUDE_DIRS}) |
|||
endif(ADDITIONAL_INCLUDE_DIRS) |
|||
if (ADDITIONAL_LINK_DIRS) |
|||
message(STATUS "StoRM (CudaPlugin) - Using additional link directories ${ADDITIONAL_LINK_DIRS}") |
|||
link_directories(${ADDITIONAL_LINK_DIRS}) |
|||
endif(ADDITIONAL_LINK_DIRS) |
|||
|
|||
############################################################# |
|||
## |
|||
## Pre executable-creation link_directories setup |
|||
## |
|||
############################################################# |
|||
|
|||
|
|||
|
|||
############################################################# |
|||
## |
|||
## CUDA |
|||
## |
|||
############################################################# |
|||
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30) |
|||
#target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY}) |
|||
#ADD_DEPENDENCIES(cudaForStorm cudaLibrary) |
|||
#target_link_libraries(cudaForStorm cudaLibrary) |
|||
message(STATUS "StoRM (CudaPlugin) - Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}") |
|||
include_directories(${CUDA_INCLUDE_DIRS}) |
|||
|
|||
############################################################# |
|||
## |
|||
## CUSP |
|||
## |
|||
############################################################# |
|||
if(CUSP_FOUND) |
|||
include_directories(${CUSP_INCLUDE_DIR}) |
|||
cuda_include_directories(${CUSP_INCLUDE_DIR}) |
|||
message(STATUS "StoRM (CudaPlugin) - Found CUSP Version ${CUSP_VERSION} in location ${CUSP_INCLUDE_DIR}") |
|||
else() |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find CUSP!") |
|||
endif() |
|||
|
|||
############################################################# |
|||
## |
|||
## Thrust |
|||
## |
|||
############################################################# |
|||
if(THRUST_FOUND) |
|||
include_directories(${THRUST_INCLUDE_DIR}) |
|||
cuda_include_directories(${THRUST_INCLUDE_DIR}) |
|||
message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION} in location ${THRUST_INCLUDE_DIR}") |
|||
else() |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find Thrust! Check your CUDA installation.") |
|||
endif() |
|||
|
|||
############################################################################### |
|||
## # |
|||
## Executable Creation # |
|||
## # |
|||
## All link_directories() calls AND include_directories() calls # |
|||
## MUST be made before this point # |
|||
## # |
|||
############################################################################### |
|||
include (GenerateExportHeader) |
|||
|
|||
cuda_add_library(cudaForStorm SHARED |
|||
${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS} |
|||
OPTIONS -DSTUFF="" -arch=sm_30 |
|||
RELEASE -DNDEBUG |
|||
DEBUG -g -DDEBUG |
|||
) |
|||
GENERATE_EXPORT_HEADER( cudaForStorm |
|||
BASE_NAME cudaForStorm |
|||
EXPORT_MACRO_NAME cudaForStorm_EXPORT |
|||
EXPORT_FILE_NAME include/cudaForStorm_Export.h |
|||
STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC |
|||
) |
|||
|
|||
if (MSVC) |
|||
# Add the DebugHelper DLL |
|||
set(CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES} Dbghelp.lib") |
|||
target_link_libraries(cudaForStorm "Dbghelp.lib") |
|||
endif(MSVC) |
|||
|
|||
# Link against libc++abi if requested. May be needed to build on Linux systems using clang. |
|||
if (LINK_LIBCXXABI) |
|||
message (STATUS "StoRM (CudaPlugin) - Linking against libc++abi.") |
|||
target_link_libraries(cudaForStorm "c++abi") |
|||
endif(LINK_LIBCXXABI) |
|||
|
|||
# Install Directive |
|||
install(TARGETS cudaForStorm DESTINATION "${STORM_LIB_INSTALL_DIR}/lib") |
|||
install(FILES "${PROJECT_SOURCE_DIR}/srcCuda/cudaForStorm.h" "${PROJECT_BINARY_DIR}/include/cudaForStorm_Export.h" DESTINATION "${STORM_LIB_INSTALL_DIR}/include") |
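The string(REGEX REPLACE ...) block above expects STORM_GIT_VERSION_STRING in the git-describe form "<major>.<minor>.<patch>-<commits ahead>-<hash>[-<appendix>]", where a "dirty" appendix marks uncommitted changes. The following C++ sketch parses a made-up string of that shape with one equivalent regular expression (illustrative only, not part of this commit):

#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::string gitVersion = "1.0.2-15-gabc1234-dirty";  // assumed sample value
    const std::regex pattern("^([0-9]+)\\.([0-9]+)\\.([0-9]+)-([0-9]+)-([a-z0-9]+)(?:-(.*))?$");
    std::smatch m;
    if (std::regex_match(gitVersion, m, pattern)) {
        const bool dirty = m[6].matched && m[6].str().find("dirty") != std::string::npos;
        std::cout << "major=" << m[1] << " minor=" << m[2] << " patch=" << m[3]
                  << " ahead=" << m[4] << " hash=" << m[5] << " dirty=" << dirty << std::endl;
    }
    return 0;
}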
@@ -0,0 +1,6 @@
#include "utility.h"
#include "bandWidth.h"
#include "basicAdd.h"
#include "kernelSwitchTest.h"
#include "basicValueIteration.h"
#include "version.h"
@@ -0,0 +1,286 @@
|||
#include <cuda.h> |
|||
#include <stdlib.h> |
|||
#include <stdio.h> |
|||
|
|||
#include <chrono> |
|||
#include <iostream> |
|||
|
|||
__global__ void cuda_kernel_basicAdd(int a, int b, int *c) { |
|||
*c = a + b; |
|||
} |
|||
|
|||
__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) { |
|||
// Fused Multiply Add: |
|||
// A * B + C => D |
|||
|
|||
/*
 * The variable i is used to index into the array. Since every thread executes the function
 * VecAdd, i has to be different for every thread; otherwise different threads would write to
 * the same array index. blockDim.x is the number of threads in the x dimension of the block,
 * blockIdx.x is the x coordinate of the current block, and threadIdx.x is the x coordinate of
 * the thread that is currently executing the function.
 */
|||
int i = blockDim.x * blockIdx.x + threadIdx.x; |
|||
|
|||
if (i < N) { |
|||
D[i] = A[i] * B[i] + C[i]; |
|||
} |
|||
} |
|||
|
|||
__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) { |
|||
// Fused Multiply Add: |
|||
// A * B + C => D |
|||
|
|||
// Layout: |
|||
// A B C D A B C D A B C D |
|||
|
|||
int i = blockDim.x * blockIdx.x + threadIdx.x; |
|||
|
|||
if ((i*M) < N) { |
|||
for (int j = i*M; j < i*M + M; ++j) { |
|||
A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2]; |
|||
} |
|||
} |
|||
} |
|||
|
|||
extern "C" int cuda_basicAdd(int a, int b) { |
|||
int c = 0; |
|||
int *dev_c; |
|||
cudaMalloc((void**)&dev_c, sizeof(int)); |
|||
cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c); |
|||
cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost); |
|||
//printf("%d + %d + 42 is %d\n", a, b, c); |
|||
cudaFree(dev_c); |
|||
return c; |
|||
} |
|||
|
|||
void cpp_cuda_bandwidthTest(int entryCount, int N) { |
|||
// Size of the Arrays |
|||
size_t arraySize = entryCount * sizeof(int); |
|||
|
|||
int* deviceIntArray; |
|||
int* hostIntArray = new int[arraySize]; |
|||
|
|||
// Allocate space on the device |
|||
auto start_time = std::chrono::high_resolution_clock::now(); |
|||
for (int i = 0; i < N; ++i) { |
|||
if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) { |
|||
std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl; |
|||
delete[] hostIntArray; |
|||
return; |
|||
} |
|||
// Free memory on device |
|||
if (cudaFree(deviceIntArray) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
delete[] hostIntArray; |
|||
return; |
|||
} |
|||
} |
|||
auto end_time = std::chrono::high_resolution_clock::now(); |
|||
auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count(); |
|||
double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625; |
|||
std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl; |
|||
std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second Allocationspeed." << std::endl; |
|||
|
|||
if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) { |
|||
std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl; |
|||
delete[] hostIntArray; |
|||
return; |
|||
} |
|||
|
|||
// Prepare data |
|||
for (int i = 0; i < N; ++i) { |
|||
hostIntArray[i] = i * 333 + 123; |
|||
} |
|||
|
|||
// Copy data TO device |
|||
start_time = std::chrono::high_resolution_clock::now(); |
|||
for (int i = 0; i < N; ++i) { |
|||
if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { |
|||
std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl; |
|||
// Free memory on device |
|||
if (cudaFree(deviceIntArray) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
} |
|||
delete[] hostIntArray; |
|||
return; |
|||
} |
|||
} |
|||
end_time = std::chrono::high_resolution_clock::now(); |
|||
copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count(); |
|||
mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625; |
|||
std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl; |
|||
std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl; |
|||
|
|||
// Copy data FROM device |
|||
start_time = std::chrono::high_resolution_clock::now(); |
|||
for (int i = 0; i < N; ++i) { |
|||
if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) { |
|||
std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl; |
|||
// Free memory on device |
|||
if (cudaFree(deviceIntArray) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
} |
|||
delete[] hostIntArray; |
|||
return; |
|||
} |
|||
} |
|||
end_time = std::chrono::high_resolution_clock::now(); |
|||
copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count(); |
|||
mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625; |
|||
std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl; |
|||
std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl; |
|||
|
|||
// Free memory on device |
|||
if (cudaFree(deviceIntArray) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
} |
|||
delete[] hostIntArray; |
|||
} |
|||
|
|||
extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) { |
|||
// Size of the Arrays |
|||
size_t arraySize = N * sizeof(int); |
|||
|
|||
int* deviceIntArrayA; |
|||
int* deviceIntArrayB; |
|||
int* deviceIntArrayC; |
|||
int* deviceIntArrayD; |
|||
|
|||
// Allocate space on the device |
|||
if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) { |
|||
printf("Error in cudaMalloc1!\n"); |
|||
return; |
|||
} |
|||
if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) { |
|||
printf("Error in cudaMalloc2!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
return; |
|||
} |
|||
if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) { |
|||
printf("Error in cudaMalloc3!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
return; |
|||
} |
|||
if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) { |
|||
printf("Error in cudaMalloc4!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
return; |
|||
} |
|||
|
|||
// Copy data TO device |
|||
if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
cudaFree(deviceIntArrayD); |
|||
return; |
|||
} |
|||
if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
cudaFree(deviceIntArrayD); |
|||
return; |
|||
} |
|||
if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
cudaFree(deviceIntArrayD); |
|||
return; |
|||
} |
|||
|
|||
// Set the number of threads per block
int threadsPerBlock = 512;
// Use as many blocks as are needed to process all elements of the vectors
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
|||
|
|||
// Run kernel |
|||
cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N); |
|||
|
|||
// Copy data FROM device |
|||
if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
cudaFree(deviceIntArrayD); |
|||
return; |
|||
} |
|||
|
|||
// Free memory on device |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
cudaFree(deviceIntArrayD); |
|||
} |
|||
|
|||
extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) { |
|||
// Size of the Arrays |
|||
size_t arraySize = N * sizeof(int) * 4; |
|||
|
|||
int* deviceIntArrayA; |
|||
|
|||
// Allocate space on the device |
|||
if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) { |
|||
printf("Error in cudaMalloc1!\n"); |
|||
return; |
|||
} |
|||
|
|||
#define ONFAILFREE0() do { } while(0) |
|||
#define ONFAILFREE1(a) do { cudaFree(a); } while(0) |
|||
#define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0) |
|||
#define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0) |
|||
#define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0) |
|||
#define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { cudaError_t retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in cuda%s!\n", #func__); return; } } while(0)
|||
|
|||
// Copy data TO device |
|||
|
|||
CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice); |
|||
|
|||
/*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
return; |
|||
}*/ |
|||
|
|||
// Set the number of threads per block
int threadsPerBlock = 512;
// Use as many blocks as are needed to process all elements of the vectors
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
|||
|
|||
// Run kernel |
|||
cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M); |
|||
|
|||
// Copy data FROM device |
|||
if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
return; |
|||
} |
|||
|
|||
// Free memory on device |
|||
if (cudaFree(deviceIntArrayA) != cudaSuccess) { |
|||
printf("Error in cudaFree!\n"); |
|||
return; |
|||
} |
|||
} |
|||
|
|||
extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) { |
|||
for (int i = 0; i < N; ++i) { |
|||
D[i] = A[i] * B[i] + C[i]; |
|||
} |
|||
} |
|||
|
|||
extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) { |
|||
for (int i = 0; i < N; i += 4) { |
|||
A[i+3] = A[i] * A[i+1] + A[i+2]; |
|||
} |
|||
} |
@@ -0,0 +1,9 @@
extern "C" int cuda_basicAdd(int a, int b);

extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M);
extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N);

extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N);
extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N);

void cpp_cuda_bandwidthTest(int entryCount, int N);
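basicAdd.h exposes the toy kernels through a plain C interface, so a host-side caller needs no CUDA headers of its own. A minimal usage sketch (hypothetical, not part of this commit) that cross-checks the GPU fused multiply-add against the CPU helper declared above:

#include <cstdio>
#include "basicAdd.h"

int main() {
    const int N = 8;
    int A[N], B[N], C[N], D_gpu[N], D_cpu[N];
    for (int i = 0; i < N; ++i) { A[i] = i; B[i] = 2 * i; C[i] = 1; D_gpu[i] = 0; D_cpu[i] = 0; }

    cuda_arrayFma(A, B, C, D_gpu, N);        // GPU: D = A * B + C
    cuda_arrayFmaHelper(A, B, C, D_cpu, N);  // CPU reference

    for (int i = 0; i < N; ++i) {
        if (D_gpu[i] != D_cpu[i]) { std::printf("mismatch at index %d\n", i); return 1; }
    }
    std::printf("cuda_basicAdd(1, 2) returned %d\n", cuda_basicAdd(1, 2));
    return 0;
}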
@@ -0,0 +1,879 @@
|||
#include "basicValueIteration.h" |
|||
#define CUSP_USE_TEXTURE_MEMORY |
|||
|
|||
#include <iostream> |
|||
#include <chrono> |
|||
|
|||
#include <cuda_runtime.h> |
|||
#include "cusparse_v2.h" |
|||
|
|||
#include "utility.h" |
|||
|
|||
#include "cuspExtension.h" |
|||
|
|||
#include <thrust/transform.h> |
|||
#include <thrust/device_ptr.h> |
|||
#include <thrust/functional.h> |
|||
|
|||
#include "storm-cudaplugin-config.h" |
|||
|
|||
#ifdef DEBUG |
|||
#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false) |
|||
#else |
|||
#define CUDA_CHECK_ALL_ERRORS() do {} while (false) |
|||
#endif |
|||
|
|||
template<typename T, bool Relative> |
|||
struct equalModuloPrecision : public thrust::binary_function<T,T,T> |
|||
{ |
|||
__host__ __device__ T operator()(const T &x, const T &y) const |
|||
{ |
|||
if (Relative) { |
|||
if (y == 0) { |
|||
return ((x >= 0) ? (x) : (-x)); |
|||
} |
|||
const T result = (x - y) / y; |
|||
return ((result >= 0) ? (result) : (-result)); |
|||
} else { |
|||
const T result = (x - y); |
|||
return ((result >= 0) ? (result) : (-result)); |
|||
} |
|||
} |
|||
}; |
|||
|
|||
template<typename IndexType, typename ValueType> |
|||
void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) { |
|||
indexVector.reserve(inputVector.size()); |
|||
valueVector.reserve(inputVector.size()); |
|||
for (size_t i = 0; i < inputVector.size(); ++i) { |
|||
indexVector.push_back(inputVector.at(i).first); |
|||
valueVector.push_back(inputVector.at(i).second); |
|||
} |
|||
} |
|||
|
|||
// TEMPLATE VERSION |
|||
template <bool Minimize, bool Relative, typename IndexType, typename ValueType> |
|||
bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
//std::vector<IndexType> matrixColumnIndices; |
|||
//std::vector<ValueType> matrixValues; |
|||
//exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues); |
|||
bool errorOccured = false; |
|||
|
|||
IndexType* device_matrixRowIndices = nullptr; |
|||
ValueType* device_matrixColIndicesAndValues = nullptr; |
|||
ValueType* device_x = nullptr; |
|||
ValueType* device_xSwap = nullptr; |
|||
ValueType* device_b = nullptr; |
|||
ValueType* device_multiplyResult = nullptr; |
|||
IndexType* device_nondeterministicChoiceIndices = nullptr; |
|||
|
|||
#ifdef DEBUG |
|||
std::cout.sync_with_stdio(true); |
|||
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl; |
|||
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl; |
|||
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size(); |
|||
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; |
|||
#endif |
|||
|
|||
const IndexType matrixRowCount = matrixRowIndices.size() - 1; |
|||
const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1; |
|||
const IndexType matrixNnzCount = columnIndicesAndValues.size(); |
|||
|
|||
cudaError_t cudaMallocResult; |
|||
|
|||
bool converged = false; |
|||
iterationCount = 0; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1)); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true |
|||
#else |
|||
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false |
|||
#endif |
|||
if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) { |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
} else { |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_xSwap), sizeof(ValueType) * matrixColCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * matrixRowCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1)); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished allocating memory." << std::endl; |
|||
#endif |
|||
|
|||
// Memory allocated, copy data to device |
|||
cudaError_t cudaCopyResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Copy all data as floats are expanded to 64bits :/ |
|||
if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) { |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
} else { |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Preset the xSwap to zeros... |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Preset the multiplyResult to zeros... |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl; |
|||
#endif |
|||
|
|||
// Data is on device, start Kernel |
|||
while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization |
|||
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b); |
|||
thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult); |
|||
|
|||
// Transform: Add multiplyResult + b inplace to multiplyResult |
|||
thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
// Reduce: Reduce multiplyResult to a new x vector |
|||
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
// Check for convergence |
|||
// Transform: x = abs(x - xSwap)/ xSwap |
|||
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x); |
|||
thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount); |
|||
thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap); |
|||
thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
// Reduce: get Max over x and check for res < Precision |
|||
ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
converged = (maxX < precision); |
|||
++iterationCount; |
|||
|
|||
// Swap pointers, device_x always contains the most current result |
|||
std::swap(device_x, device_xSwap); |
|||
} |
|||
|
|||
if (!converged && (iterationCount == maxIterationCount)) { |
|||
iterationCount = 0; |
|||
errorOccured = true; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished kernel execution." << std::endl; |
|||
std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl; |
|||
#endif |
|||
|
|||
// Get x back from the device |
|||
cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished copying result data." << std::endl; |
|||
#endif |
|||
|
|||
// All code related to freeing memory and clearing up the device |
|||
cleanup: |
|||
if (device_matrixRowIndices != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_matrixRowIndices = nullptr; |
|||
} |
|||
if (device_matrixColIndicesAndValues != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_matrixColIndicesAndValues = nullptr; |
|||
} |
|||
if (device_x != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_x); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_x = nullptr; |
|||
} |
|||
if (device_xSwap != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_xSwap); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_xSwap = nullptr; |
|||
} |
|||
if (device_b != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_b); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_b = nullptr; |
|||
} |
|||
if (device_multiplyResult != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_multiplyResult); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_multiplyResult = nullptr; |
|||
} |
|||
if (device_nondeterministicChoiceIndices != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_nondeterministicChoiceIndices = nullptr; |
|||
} |
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished cleanup." << std::endl; |
|||
#endif |
|||
|
|||
return !errorOccured; |
|||
} |
|||
|
|||
template <typename IndexType, typename ValueType> |
|||
void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) { |
|||
IndexType* device_matrixRowIndices = nullptr; |
|||
ValueType* device_matrixColIndicesAndValues = nullptr; |
|||
ValueType* device_x = nullptr; |
|||
ValueType* device_multiplyResult = nullptr; |
|||
|
|||
#ifdef DEBUG |
|||
std::cout.sync_with_stdio(true); |
|||
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl; |
|||
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl; |
|||
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size(); |
|||
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; |
|||
#endif |
|||
|
|||
const IndexType matrixRowCount = matrixRowIndices.size() - 1; |
|||
const IndexType matrixNnzCount = columnIndicesAndValues.size(); |
|||
|
|||
cudaError_t cudaMallocResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1)); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
#else |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
#endif |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished allocating memory." << std::endl; |
|||
#endif |
|||
|
|||
// Memory allocated, copy data to device |
|||
cudaError_t cudaCopyResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
#else |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
#endif |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Preset the multiplyResult to zeros... |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl; |
|||
#endif |
|||
|
|||
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished kernel execution." << std::endl; |
|||
#endif |
|||
|
|||
// Get result back from the device |
|||
cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished copying result data." << std::endl; |
|||
#endif |
|||
|
|||
// All code related to freeing memory and clearing up the device |
|||
cleanup: |
|||
if (device_matrixRowIndices != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_matrixRowIndices = nullptr; |
|||
} |
|||
if (device_matrixColIndicesAndValues != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_matrixColIndicesAndValues = nullptr; |
|||
} |
|||
if (device_x != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_x); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_x = nullptr; |
|||
} |
|||
if (device_multiplyResult != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_multiplyResult); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_multiplyResult = nullptr; |
|||
} |
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished cleanup." << std::endl; |
|||
#endif |
|||
} |
|||
|
|||
template <typename ValueType> |
|||
void basicValueIteration_addVectorsInplace(std::vector<ValueType>& a, std::vector<ValueType> const& b) { |
|||
ValueType* device_a = nullptr; |
|||
ValueType* device_b = nullptr; |
|||
|
|||
const size_t vectorSize = std::max(a.size(), b.size()); |
|||
|
|||
cudaError_t cudaMallocResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_a), sizeof(ValueType) * vectorSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * vectorSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Memory allocated, copy data to device |
|||
cudaError_t cudaCopyResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
do { |
|||
// Transform: add b to a element-wise, storing the result in a
|||
thrust::device_ptr<ValueType> devicePtrThrust_a(device_a); |
|||
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b); |
|||
thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus<ValueType>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
} while (false); |
|||
|
|||
// Get result back from the device |
|||
cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// All code related to freeing memory and clearing up the device |
|||
cleanup: |
|||
if (device_a != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_a); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_a = nullptr; |
|||
} |
|||
if (device_b != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_b); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_b = nullptr; |
|||
} |
|||
} |
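// Illustrative usage of the in-place addition above (hypothetical host code,
// not part of the original source). After the call, a holds the element-wise sum:
//
//   std::vector<double> a = {1.0, 2.0, 3.0};
//   std::vector<double> b = {0.5, 0.5, 0.5};
//   basicValueIteration_addVectorsInplace_double(a, b); // a becomes {1.5, 2.5, 3.5}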
|||
|
|||
template <typename IndexType, typename ValueType, bool Minimize> |
|||
void basicValueIteration_reduceGroupedVector(std::vector<ValueType> const& groupedVector, std::vector<IndexType> const& grouping, std::vector<ValueType>& targetVector) { |
|||
ValueType* device_groupedVector = nullptr; |
|||
IndexType* device_grouping = nullptr; |
|||
ValueType* device_target = nullptr; |
|||
|
|||
const size_t groupedSize = groupedVector.size(); |
|||
const size_t groupingSize = grouping.size(); |
|||
const size_t targetSize = targetVector.size(); |
|||
|
|||
cudaError_t cudaMallocResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_groupedVector), sizeof(ValueType) * groupedSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_grouping), sizeof(IndexType) * groupingSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_target), sizeof(ValueType) * targetSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Memory allocated, copy data to device |
|||
cudaError_t cudaCopyResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
do { |
|||
// Reduce: reduce each row group of groupedVector to a single value in targetVector
|||
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
} while (false); |
|||
|
|||
// Get result back from the device |
|||
cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// All code related to freeing memory and clearing up the device |
|||
cleanup: |
|||
if (device_groupedVector != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_groupedVector); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_groupedVector = nullptr; |
|||
} |
|||
if (device_grouping != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_grouping); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_grouping = nullptr; |
|||
} |
|||
if (device_target != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_target); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_target = nullptr; |
|||
} |
|||
} |
|||
|
|||
template <typename ValueType, bool Relative> |
|||
void basicValueIteration_equalModuloPrecision(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType& maxElement) { |
|||
ValueType* device_x = nullptr; |
|||
ValueType* device_y = nullptr; |
|||
|
|||
const size_t vectorSize = x.size(); |
|||
|
|||
cudaError_t cudaMallocResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * vectorSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_y), sizeof(ValueType) * vectorSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Memory allocated, copy data to device |
|||
cudaError_t cudaCopyResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
do { |
|||
// Transform: x = abs(x - y) / y element-wise (absolute difference when Relative is false)
|||
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x); |
|||
thrust::device_ptr<ValueType> devicePtrThrust_y(device_y); |
|||
thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
// Reduce: take the maximum over x; the caller compares the result against the precision
|||
maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
} while (false); |
|||
|
|||
// All code related to freeing memory and clearing up the device |
|||
cleanup: |
|||
if (device_x != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_x); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_x = nullptr; |
|||
} |
|||
if (device_y != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_y); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_y = nullptr; |
|||
} |
|||
} |
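// Illustrative convergence check (hypothetical host code, not part of the
// original source): the caller compares the returned maximum difference
// against the desired precision, e.g.
//
//   double maxDiff = 0.0;
//   basicValueIteration_equalModuloPrecision_double_NonRelative(x, xSwap, maxDiff);
//   bool converged = (maxDiff < precision); // x, xSwap and precision are assumed host-side names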
|||
|
|||
/* |
|||
* Declare and implement all exported functions for these Kernels here |
|||
* |
|||
*/ |
|||
|
|||
void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) { |
|||
basicValueIteration_spmv<uint_fast64_t, double>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); |
|||
} |
|||
|
|||
void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) { |
|||
basicValueIteration_addVectorsInplace<double>(a, b); |
|||
} |
|||
|
|||
void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) { |
|||
basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(groupedVector, grouping, targetVector); |
|||
} |
|||
|
|||
void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) { |
|||
basicValueIteration_reduceGroupedVector<uint_fast64_t, double, false>(groupedVector, grouping, targetVector); |
|||
} |
|||
|
|||
void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) { |
|||
basicValueIteration_equalModuloPrecision<double, true>(x, y, maxElement); |
|||
} |
|||
|
|||
void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) { |
|||
basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement); |
|||
} |
|||
|
|||
// Float |
|||
void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b) { |
|||
basicValueIteration_spmv<uint_fast64_t, float>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); |
|||
} |
|||
|
|||
void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b) { |
|||
basicValueIteration_addVectorsInplace<float>(a, b); |
|||
} |
|||
|
|||
void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) { |
|||
basicValueIteration_reduceGroupedVector<uint_fast64_t, float, true>(groupedVector, grouping, targetVector); |
|||
} |
|||
|
|||
void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) { |
|||
basicValueIteration_reduceGroupedVector<uint_fast64_t, float, false>(groupedVector, grouping, targetVector); |
|||
} |
|||
|
|||
void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) { |
|||
basicValueIteration_equalModuloPrecision<float, true>(x, y, maxElement); |
|||
} |
|||
|
|||
void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) { |
|||
basicValueIteration_equalModuloPrecision<float, false>(x, y, maxElement); |
|||
} |
|||
|
|||
bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
if (relativePrecisionCheck) { |
|||
return basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} else { |
|||
return basicValueIteration_mvReduce<true, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} |
|||
} |
|||
|
|||
bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
if (relativePrecisionCheck) { |
|||
return basicValueIteration_mvReduce<false, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} else { |
|||
return basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} |
|||
} |
|||
|
|||
bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
if (relativePrecisionCheck) { |
|||
return basicValueIteration_mvReduce<true, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} else { |
|||
return basicValueIteration_mvReduce<true, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} |
|||
} |
|||
|
|||
bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
if (relativePrecisionCheck) { |
|||
return basicValueIteration_mvReduce<false, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} else { |
|||
return basicValueIteration_mvReduce<false, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} |
|||
} |
|||
|
|||
size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) { |
|||
size_t const valueTypeSize = sizeof(double); |
|||
size_t const indexTypeSize = sizeof(uint_fast64_t); |
|||
|
|||
/* |
|||
IndexType* device_matrixRowIndices = nullptr; |
|||
IndexType* device_matrixColIndices = nullptr; |
|||
ValueType* device_matrixValues = nullptr; |
|||
ValueType* device_x = nullptr; |
|||
ValueType* device_xSwap = nullptr; |
|||
ValueType* device_b = nullptr; |
|||
ValueType* device_multiplyResult = nullptr; |
|||
IndexType* device_nondeterministicChoiceIndices = nullptr; |
|||
*/ |
|||
|
|||
// Row Indices, Column Indices, Values, Choice Indices |
|||
size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize); |
|||
// Vectors x, xSwap, b, multiplyResult |
|||
size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize); |
|||
|
|||
return (matrixDataSize + vectorSizes); |
|||
} |
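// Worked example for the estimate above (illustrative numbers only): a
// hypothetical MDP with rowCount = 6, rowGroupCount = 3 and nnzCount = 10,
// using 8-byte indices and 8-byte doubles, yields
//   matrixDataSize = (7 * 8) + (10 * 8) + (10 * 8) + (4 * 8) = 248 Bytes
//   vectorSizes    = (3 * 8) + (3 * 8) + (6 * 8) + (6 * 8)   = 144 Bytes
// so basicValueIteration_mvReduce_uint64_double_calculateMemorySize(6, 3, 10)
// returns 392 Bytes.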
|||
|
|||
size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) { |
|||
size_t const valueTypeSize = sizeof(float); |
|||
size_t const indexTypeSize = sizeof(uint_fast64_t); |
|||
|
|||
/* |
|||
IndexType* device_matrixRowIndices = nullptr; |
|||
IndexType* device_matrixColIndices = nullptr; |
|||
ValueType* device_matrixValues = nullptr; |
|||
ValueType* device_x = nullptr; |
|||
ValueType* device_xSwap = nullptr; |
|||
ValueType* device_b = nullptr; |
|||
ValueType* device_multiplyResult = nullptr; |
|||
IndexType* device_nondeterministicChoiceIndices = nullptr; |
|||
*/ |
|||
|
|||
// Row Indices, Column Indices, Values, Choice Indices |
|||
size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize); |
|||
// Vectors x, xSwap, b, multiplyResult |
|||
size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize); |
|||
|
|||
return (matrixDataSize + vectorSizes); |
|||
} |
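// --- Illustrative host-side sketch (not part of the original source) --------
// Minimal example of how the exported SpMV entry point could be driven from
// host code, assuming a hypothetical 2x2 CSR matrix in the flattened
// (column, value) layout declared in basicValueIteration.h. The function name
// exampleSpmvUsage and the surrounding guard macro are illustrative only.
#ifdef STORM_CUDAPLUGIN_COMPILE_USAGE_SKETCH
void exampleSpmvUsage() {
	// CSR row offsets: row 0 owns entry 0, row 1 owns entry 1.
	std::vector<uint_fast64_t> rowIndices = {0, 1, 2};
	// One (column, value) pair per non-zero entry.
	std::vector<storm::storage::MatrixEntry<double>> entries;
	entries.push_back(storm::storage::MatrixEntry<double>(0, 0.5));
	entries.push_back(storm::storage::MatrixEntry<double>(1, 1.0));
	std::vector<double> x = {1.0, 2.0};
	std::vector<double> result(2, 0.0); // receives A * x, here {0.5, 2.0}
	basicValueIteration_spmv_uint64_double(2, rowIndices, entries, x, result);
}
#endif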
@ -0,0 +1,107 @@ |
|||
#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
|||
#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
|||
|
|||
#include <cstdint> |
|||
#include <vector> |
|||
#include <utility> |
|||
|
|||
// Library exports |
|||
#include "cudaForStorm_Export.h" |
|||
|
|||
/* Helper declaration to cope with new internal format */ |
|||
#ifndef STORM_STORAGE_SPARSEMATRIX_H_ |
|||
namespace storm { |
|||
namespace storage { |
|||
template<typename T> |
|||
class MatrixEntry { |
|||
public: |
|||
/*! |
|||
* Constructs a matrix entry with the given column and value. |
|||
* |
|||
* @param column The column of the matrix entry. |
|||
* @param value The value of the matrix entry. |
|||
*/ |
|||
MatrixEntry(uint_fast64_t column, T value); |
|||
|
|||
/*! |
|||
* Move-constructs the matrix entry from the given column-value pair.
|||
* |
|||
* @param pair The column-value pair from which to move-construct the matrix entry. |
|||
*/ |
|||
MatrixEntry(std::pair<uint_fast64_t, T>&& pair); |
|||
|
|||
//MatrixEntry() = default; |
|||
//MatrixEntry(MatrixEntry const& other) = default; |
|||
//MatrixEntry& operator=(MatrixEntry const& other) = default; |
|||
#ifndef WINDOWS |
|||
//MatrixEntry(MatrixEntry&& other) = default; |
|||
//MatrixEntry& operator=(MatrixEntry&& other) = default; |
|||
#endif |
|||
|
|||
/*! |
|||
* Retrieves the column of the matrix entry. |
|||
* |
|||
* @return The column of the matrix entry. |
|||
*/ |
|||
uint_fast64_t const& getColumn() const; |
|||
|
|||
/*! |
|||
* Retrieves the column of the matrix entry. |
|||
* |
|||
* @return The column of the matrix entry. |
|||
*/ |
|||
uint_fast64_t& getColumn(); |
|||
|
|||
/*! |
|||
* Retrieves the value of the matrix entry. |
|||
* |
|||
* @return The value of the matrix entry. |
|||
*/ |
|||
T const& getValue() const; |
|||
|
|||
/*! |
|||
* Retrieves the value of the matrix entry. |
|||
* |
|||
* @return The value of the matrix entry. |
|||
*/ |
|||
T& getValue(); |
|||
|
|||
/*! |
|||
* Retrieves a pair of column and value that characterizes this entry. |
|||
* |
|||
* @return A column-value pair that characterizes this entry. |
|||
*/ |
|||
std::pair<uint_fast64_t, T> const& getColumnValuePair() const; |
|||
|
|||
private: |
|||
// The actual matrix entry. |
|||
std::pair<uint_fast64_t, T> entry; |
|||
}; |
|||
|
|||
} |
|||
} |
|||
#endif |
|||
|
|||
cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); |
|||
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
|||
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
|||
|
|||
cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); |
|||
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
|||
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
|||
|
|||
cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b); |
|||
cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b); |
|||
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector); |
|||
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector); |
|||
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement); |
|||
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement); |
|||
|
|||
cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b); |
|||
cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b); |
|||
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector); |
|||
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector); |
|||
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement); |
|||
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement); |
|||
|
|||
#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
@ -0,0 +1,19 @@ |
|||
#ifndef STORM_CUDAFORSTORM_CUDAFORSTORM_H_ |
|||
#define STORM_CUDAFORSTORM_CUDAFORSTORM_H_ |
|||
|
|||
/* |
|||
* List of exported functions in this library |
|||
*/ |
|||
|
|||
// TopologicalValueIteration |
|||
#include "srcCuda/basicValueIteration.h" |
|||
|
|||
// Utility Functions |
|||
#include "srcCuda/utility.h" |
|||
|
|||
// Version Information |
|||
#include "srcCuda/version.h" |
|||
|
|||
|
|||
|
|||
#endif // STORM_CUDAFORSTORM_CUDAFORSTORM_H_ |
@ -0,0 +1,49 @@ |
|||
#pragma once |
|||
|
|||
#include "cuspExtensionFloat.h" |
|||
#include "cuspExtensionDouble.h" |
|||
|
|||
namespace cusp { |
|||
namespace detail { |
|||
namespace device { |
|||
|
|||
template <typename ValueType> |
|||
void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) { |
|||
// |
|||
throw; |
|||
} |
|||
template <> |
|||
void storm_cuda_opt_spmv_csr_vector<double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) { |
|||
storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
template <> |
|||
void storm_cuda_opt_spmv_csr_vector<float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) { |
|||
storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
template <bool Minimize, typename ValueType> |
|||
void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) { |
|||
// |
|||
throw; |
|||
} |
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<true, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) { |
|||
storm_cuda_opt_vector_reduce_double<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<false, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) { |
|||
storm_cuda_opt_vector_reduce_double<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
|
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<true, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) { |
|||
storm_cuda_opt_vector_reduce_float<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<false, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) { |
|||
storm_cuda_opt_vector_reduce_float<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
|
|||
} // end namespace device |
|||
} // end namespace detail |
|||
} // end namespace cusp |
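// Illustrative call site (not part of the original header): only the double
// and float specializations above are implemented; the generic templates are
// not meant to be instantiated. The plugin invokes the dispatcher as in
// basicValueIteration.cu, with all pointers referring to device memory:
//   cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(
//       matrixRowCount, matrixNnzCount, device_matrixRowIndices,
//       device_matrixColIndicesAndValues, device_x, device_multiplyResult);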
@ -0,0 +1,361 @@ |
|||
/* |
|||
* This is an extension of the original CUSP csr_vector.h SPMV implementation. |
|||
* It is based on that code and incorporates changes to cope with the details
|||
* of the StoRM code. |
|||
* Changes have been made for 1) the different input format, 2) the sum calculation, and 3) the group-reduce algorithm.
|||
*/ |
|||
|
|||
/* |
|||
* Copyright 2008-2009 NVIDIA Corporation |
|||
* |
|||
* Licensed under the Apache License, Version 2.0 (the "License"); |
|||
* you may not use this file except in compliance with the License. |
|||
* You may obtain a copy of the License at |
|||
* |
|||
* http://www.apache.org/licenses/LICENSE-2.0 |
|||
* |
|||
* Unless required by applicable law or agreed to in writing, software |
|||
* distributed under the License is distributed on an "AS IS" BASIS, |
|||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
* See the License for the specific language governing permissions and |
|||
* limitations under the License. |
|||
*/ |
|||
|
|||
|
|||
#pragma once |
|||
|
|||
#include <limits> |
|||
#include <cstdint> |
|||
#include <algorithm> |
|||
|
|||
#include <math_functions.h> |
|||
|
|||
#include <cusp/detail/device/spmv/csr_vector.h> |
|||
|
|||
namespace cusp |
|||
{ |
|||
namespace detail |
|||
{ |
|||
namespace device |
|||
{ |
|||
|
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// CSR SpMV kernels based on a vector model (one warp per row) |
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// |
|||
// spmv_csr_vector_device |
|||
// Each row of the CSR matrix is assigned to a warp. The warp computes |
|||
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
|||
// the x vector, in parallel. This division of work implies that |
|||
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
|||
// manner (but generally not aligned). On GT200 these accesses are |
|||
// coalesced, unlike kernels based on the one-row-per-thread division of |
|||
// work. Since an entire 32-thread warp is assigned to each row, many |
|||
// threads will remain idle when their row contains a small number |
|||
// of elements. This code relies on implicit synchronization among |
|||
// threads in a warp. |
|||
// |
|||
// spmv_csr_vector_tex_device |
|||
// Same as spmv_csr_vector_device, except that the texture cache is
|||
// used for accessing the x vector. |
|||
// |
|||
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
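// Worked example of the index arithmetic in the kernel below (illustrative
// only): with THREADS_PER_BLOCK = 128 and THREADS_PER_VECTOR = 4,
// VECTORS_PER_BLOCK is 32. Thread 70 of block 1 then has
//   thread_id   = 128 * 1 + 70 = 198
//   thread_lane = 70 & 3       = 2
//   vector_id   = 198 / 4      = 49
//   vector_lane = 70 / 4       = 17
// so it cooperates on rows 49, 49 + 32 * gridDim.x, ... of the matrix.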
|||
|
|||
|
|||
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
|||
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
|||
__global__ void |
|||
storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y) |
|||
{ |
|||
__shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
|||
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local sum |
|||
double sum = 0; |
|||
|
|||
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
|||
} |
|||
} else { |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = sum; |
|||
|
|||
// reduce local sums to row sum |
|||
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
|||
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
|||
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
|||
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
|||
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
y[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize> |
|||
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) |
|||
__global__ void |
|||
storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer) |
|||
{ |
|||
__shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block |
|||
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local Min/Max |
|||
double localMinMaxElement = minMaxInitializer; |
|||
|
|||
if (THREADS_PER_ROW == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
else |
|||
{ |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = localMinMaxElement; |
|||
|
|||
// reduce local min/max to row min/max |
|||
if (Minimize) { |
|||
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
|||
|
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); |
|||
} else { |
|||
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); |
|||
} |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
x[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <bool Minimize, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) |
|||
{ |
|||
double __minMaxInitializer = -std::numeric_limits<double>::max(); |
|||
if (Minimize) { |
|||
__minMaxInitializer = std::numeric_limits<double>::max(); |
|||
} |
|||
const double minMaxInitializer = __minMaxInitializer; |
|||
|
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); |
|||
} |
|||
|
|||
template <bool Minimize> |
|||
void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) |
|||
{ |
|||
const uint_fast64_t rows_per_group = num_entries / num_rows; |
|||
|
|||
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
|
|||
__storm_cuda_opt_vector_reduce_double<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y); |
|||
} |
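// Illustrative dispatch (assumed numbers, not part of the original source):
// for num_rows = 1000 row groups covering num_entries = 3500 matrix rows,
// rows_per_group = 3500 / 1000 = 3, so the <Minimize, 4> instantiation is
// launched and each row group is reduced by 4 cooperating threads.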
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
} |
|||
|
|||
void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
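// Illustrative dispatch (assumed numbers, not part of the original source):
// a matrix with num_rows = 10000 and num_entries = 45000 has nnz_per_row =
// 45000 / 10000 = 4, so __storm_cuda_opt_spmv_csr_vector_double<false, 4> is
// launched and 128 / 4 = 32 rows are handled per thread block.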
|||
|
|||
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
// NON-OPT |
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
} |
|||
|
|||
void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
} |
|||
|
|||
void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
} |
|||
|
|||
} // end namespace device |
|||
} // end namespace detail |
|||
} // end namespace cusp |
@ -0,0 +1,375 @@ |
|||
/* |
|||
* This is an extension of the original CUSP csr_vector.h SPMV implementation. |
|||
* It is based on that code and incorporates changes to cope with the details
|||
* of the StoRM code. |
|||
* As this is mostly copy & paste, the original license still applies. |
|||
*/ |
|||
|
|||
/* |
|||
* Copyright 2008-2009 NVIDIA Corporation |
|||
* |
|||
* Licensed under the Apache License, Version 2.0 (the "License"); |
|||
* you may not use this file except in compliance with the License. |
|||
* You may obtain a copy of the License at |
|||
* |
|||
* http://www.apache.org/licenses/LICENSE-2.0 |
|||
* |
|||
* Unless required by applicable law or agreed to in writing, software |
|||
* distributed under the License is distributed on an "AS IS" BASIS, |
|||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
* See the License for the specific language governing permissions and |
|||
* limitations under the License. |
|||
*/ |
|||
|
|||
|
|||
#pragma once |
|||
|
|||
#include <limits> |
|||
#include <cstdint> |
|||
#include <algorithm> |
|||
|
|||
#include <math_functions.h> |
|||
|
|||
#include <cusp/detail/device/spmv/csr_vector.h> |
|||
|
|||
#include "storm-cudaplugin-config.h" |
|||
|
|||
namespace cusp |
|||
{ |
|||
namespace detail |
|||
{ |
|||
namespace device |
|||
{ |
|||
|
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// CSR SpMV kernels based on a vector model (one warp per row) |
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// |
|||
// spmv_csr_vector_device |
|||
// Each row of the CSR matrix is assigned to a warp. The warp computes |
|||
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
|||
// the x vector, in parallel. This division of work implies that |
|||
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
|||
// manner (but generally not aligned). On GT200 these accesses are |
|||
// coalesced, unlike kernels based on the one-row-per-thread division of |
|||
// work. Since an entire 32-thread warp is assigned to each row, many |
|||
// threads will remain idle when their row contains a small number |
|||
// of elements. This code relies on implicit synchronization among |
|||
// threads in a warp. |
|||
// |
|||
// spmv_csr_vector_tex_device |
|||
// Same as spmv_csr_vector_device, except that the texture cache is |
|||
// used for accessing the x vector. |
|||
// |
|||
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
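//
// Hedged worked example (added for clarity, not in the original CUSP comment): with
// THREADS_PER_BLOCK = 128 and THREADS_PER_VECTOR = 4, every block holds
// VECTORS_PER_BLOCK = 32 row-vectors. For blockIdx.x = 1 and threadIdx.x = 70:
//   thread_id   = 128 * 1 + 70 = 198
//   thread_lane = 70 & (4 - 1) = 2     (position within the 4-thread vector)
//   vector_id   = 198 / 4      = 49    (the first row this thread helps to process)
//   vector_lane = 70 / 4       = 17    (which of the 32 vectors inside the block)
// The grid-stride loop then advances each vector by VECTORS_PER_BLOCK * gridDim.x rows.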
|||
|
|||
|
|||
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
|||
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
|||
__global__ void |
|||
storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y) |
|||
{ |
|||
__shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
|||
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local sum |
|||
float sum = 0; |
|||
|
|||
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
|||
#else |
|||
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
|||
#endif |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
|||
#else |
|||
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
|||
#endif |
|||
} |
|||
} else { |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
|||
#else |
|||
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
|||
#endif |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = sum; |
|||
|
|||
// reduce local sums to row sum |
|||
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
|||
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
|||
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
|||
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
|||
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
y[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize> |
|||
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) |
|||
__global__ void |
|||
storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer) |
|||
{ |
|||
__shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block |
|||
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local Min/Max |
|||
float localMinMaxElement = minMaxInitializer; |
|||
|
|||
if (THREADS_PER_ROW == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
else |
|||
{ |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = localMinMaxElement; |
|||
|
|||
// reduce local min/max to row min/max |
|||
if (Minimize) { |
|||
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
|||
|
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); |
|||
} else { |
|||
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); |
|||
} |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
x[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <bool Minimize, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) |
|||
{ |
|||
float __minMaxInitializer = -std::numeric_limits<float>::max(); |
|||
if (Minimize) { |
|||
__minMaxInitializer = std::numeric_limits<float>::max(); |
|||
} |
|||
const float minMaxInitializer = __minMaxInitializer; |
|||
|
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); |
|||
} |
|||
|
|||
template <bool Minimize> |
|||
void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) |
|||
{ |
|||
const uint_fast64_t rows_per_group = num_entries / num_rows; |
|||
|
|||
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_float<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_float<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_float<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
|
|||
__storm_cuda_opt_vector_reduce_float<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y); |
|||
} |
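// Hedged illustration (not part of the original file): the reduce above produces one value
// per row group, i.e. per state. For nondeterministicChoiceIndices = {0, 2, 5} and
// y = {0.3, 0.7, 0.1, 0.9, 0.5}, a minimizing reduce yields
//   x[0] = min(y[0], y[1]) = 0.3   and   x[1] = min(y[2], y[3], y[4]) = 0.1,
// while a maximizing reduce yields x[0] = 0.7 and x[1] = 0.9.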
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
} |
|||
|
|||
void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
// NON-OPT |
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
} |
|||
|
|||
void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
} |
|||
|
|||
void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
} |
|||
|
|||
} // end namespace device |
|||
} // end namespace detail |
|||
} // end namespace cusp |
@ -0,0 +1,39 @@ |
|||
#include <iostream> |
|||
#include <chrono> |
|||
|
|||
__global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) { |
|||
*B = *A; |
|||
} |
|||
|
|||
void kernelSwitchTest(size_t N) { |
|||
int* deviceIntA; |
|||
int* deviceIntB; |
|||
|
|||
if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) { |
|||
std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; |
|||
return; |
|||
} |
|||
if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) { |
|||
std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; |
|||
return; |
|||
} |
|||
|
|||
// Launch the kernel N times and measure the elapsed host-side time |
|||
auto start_time = std::chrono::high_resolution_clock::now(); |
|||
for (size_t i = 0; i < N; ++i) { |
|||
cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB); |
|||
} |
|||
auto end_time = std::chrono::high_resolution_clock::now(); |
|||
std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl; |
|||
std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl; |
|||
|
|||
// Free memory on device |
|||
if (cudaFree(deviceIntA) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
return; |
|||
} |
|||
if (cudaFree(deviceIntB) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
return; |
|||
} |
|||
} |
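// Hedged alternative sketch (not part of the original file): kernel launches are asynchronous,
// so the chrono-based loop above mostly measures launch overhead on the host. A device-side
// measurement with CUDA events could look like the following; the function name
// kernelSwitchTestEventTiming is hypothetical.
void kernelSwitchTestEventTiming(size_t N) {
	int* deviceIntA;
	int* deviceIntB;
	if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) return;
	if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) return;

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventRecord(start);
	for (size_t i = 0; i < N; ++i) {
		cuda_kernel_kernelSwitchTest<<<1, 1>>>(deviceIntA, deviceIntB);
	}
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);

	float milliseconds = 0.0f;
	cudaEventElapsedTime(&milliseconds, start, stop);
	std::cout << "Launching the kernel " << N << " times took " << milliseconds << " ms on the device." << std::endl;

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(deviceIntA);
	cudaFree(deviceIntB);
}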
@ -0,0 +1 @@ |
|||
void kernelSwitchTest(size_t N); |
@ -0,0 +1,33 @@ |
|||
#include "utility.h" |
|||
|
|||
#include <cuda_runtime.h> |
|||
|
|||
size_t getFreeCudaMemory() { |
|||
size_t freeMemory; |
|||
size_t totalMemory; |
|||
cudaMemGetInfo(&freeMemory, &totalMemory); |
|||
|
|||
return freeMemory; |
|||
} |
|||
|
|||
size_t getTotalCudaMemory() { |
|||
size_t freeMemory; |
|||
size_t totalMemory; |
|||
cudaMemGetInfo(&freeMemory, &totalMemory); |
|||
|
|||
return totalMemory; |
|||
} |
|||
|
|||
bool resetCudaDevice() { |
|||
cudaError_t result = cudaDeviceReset(); |
|||
return (result == cudaSuccess); |
|||
} |
|||
|
|||
int getRuntimeCudaVersion() { |
|||
int result = -1; |
|||
cudaError_t errorResult = cudaRuntimeGetVersion(&result); |
|||
if (errorResult != cudaSuccess) { |
|||
return -1; |
|||
} |
|||
return result; |
|||
} |
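// Hedged usage sketch (not part of the original file): deciding whether a problem of a given
// size fits onto the device before choosing the CUDA-based solver. The helper name
// hasEnoughFreeCudaMemory and the requiredBytes parameter are hypothetical.
bool hasEnoughFreeCudaMemory(size_t requiredBytes) {
	// Leave a small safety margin, mirroring the 95% padding used by the topological solver.
	return static_cast<size_t>(getFreeCudaMemory() * 0.95) >= requiredBytes;
}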
@ -0,0 +1,12 @@ |
|||
#ifndef STORM_CUDAFORSTORM_UTILITY_H_ |
|||
#define STORM_CUDAFORSTORM_UTILITY_H_ |
|||
|
|||
// Library exports |
|||
#include "cudaForStorm_Export.h" |
|||
|
|||
cudaForStorm_EXPORT size_t getFreeCudaMemory(); |
|||
cudaForStorm_EXPORT size_t getTotalCudaMemory(); |
|||
cudaForStorm_EXPORT bool resetCudaDevice(); |
|||
cudaForStorm_EXPORT int getRuntimeCudaVersion(); |
|||
|
|||
#endif // STORM_CUDAFORSTORM_UTILITY_H_ |
@ -0,0 +1,28 @@ |
|||
#include "version.h" |
|||
|
|||
#include "storm-cudaplugin-config.h" |
|||
|
|||
size_t getStormCudaPluginVersionMajor() { |
|||
return STORM_CUDAPLUGIN_VERSION_MAJOR; |
|||
} |
|||
|
|||
size_t getStormCudaPluginVersionMinor() { |
|||
return STORM_CUDAPLUGIN_VERSION_MINOR; |
|||
} |
|||
|
|||
size_t getStormCudaPluginVersionPatch() { |
|||
return STORM_CUDAPLUGIN_VERSION_PATCH; |
|||
} |
|||
|
|||
size_t getStormCudaPluginVersionCommitsAhead() { |
|||
return STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD; |
|||
} |
|||
|
|||
const char* getStormCudaPluginVersionHash() { |
|||
static const std::string versionHash = STORM_CUDAPLUGIN_VERSION_HASH; |
|||
return versionHash.c_str(); |
|||
} |
|||
|
|||
bool getStormCudaPluginVersionIsDirty() { |
|||
return ((STORM_CUDAPLUGIN_VERSION_DIRTY) != 0); |
|||
} |
@ -0,0 +1,16 @@ |
|||
#ifndef STORM_CUDAFORSTORM_VERSION_H_ |
|||
#define STORM_CUDAFORSTORM_VERSION_H_ |
|||
|
|||
// Library exports |
|||
#include "cudaForStorm_Export.h" |
|||
|
|||
#include <string> |
|||
|
|||
cudaForStorm_EXPORT size_t getStormCudaPluginVersionMajor(); |
|||
cudaForStorm_EXPORT size_t getStormCudaPluginVersionMinor(); |
|||
cudaForStorm_EXPORT size_t getStormCudaPluginVersionPatch(); |
|||
cudaForStorm_EXPORT size_t getStormCudaPluginVersionCommitsAhead(); |
|||
cudaForStorm_EXPORT const char* getStormCudaPluginVersionHash(); |
|||
cudaForStorm_EXPORT bool getStormCudaPluginVersionIsDirty(); |
|||
|
|||
#endif // STORM_CUDAFORSTORM_VERSION_H_ |
@ -0,0 +1,21 @@ |
|||
/* |
|||
* StoRM - Built-in Options |
|||
* |
|||
* This file is parsed by CMake during makefile generation |
|||
*/ |
|||
|
|||
#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ |
|||
#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ |
|||
|
|||
// Version Information |
|||
#define STORM_CUDAPLUGIN_VERSION_MAJOR @STORM_CUDAPLUGIN_VERSION_MAJOR@ // The major version of StoRM |
|||
#define STORM_CUDAPLUGIN_VERSION_MINOR @STORM_CUDAPLUGIN_VERSION_MINOR@ // The minor version of StoRM |
|||
#define STORM_CUDAPLUGIN_VERSION_PATCH @STORM_CUDAPLUGIN_VERSION_PATCH@ // The patch version of StoRM |
|||
#define STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD @STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD@ // How many commits passed since the tag was last set |
|||
#define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" // The short hash of the git commit this build is based on |
|||
#define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ // 0 iff no files were modified in the checkout, 1 otherwise |
|||
|
|||
// Whether the size of float in a pair<uint_fast64_t, float> is expanded to 64bit |
|||
#@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
|
|||
#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ |
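/*
 * Hedged illustration (not part of the generated header): the alignment flag above matches the
 * element stride used by the kernels in cuspExtensionFloat.h. Each packed CSR entry is a 64-bit
 * column index followed by a 32-bit float value; without padding that is 12 bytes, i.e. a stride
 * of 3 floats (the "3 * jj" indexing), while with 64-bit alignment it is padded to 16 bytes,
 * i.e. a stride of 4 floats (the "4 * jj" indexing):
 *
 *   #ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
 *   const size_t floatsPerEntry = 4; // 8-byte index + 4-byte value + 4 bytes of padding
 *   #else
 *   const size_t floatsPerEntry = 3; // 8-byte index + 4-byte value, tightly packed
 *   #endif
 */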
@ -0,0 +1,111 @@ |
|||
#include "src/models/PseudoModel.h"
|
|||
#include "src/utility/constants.h"
|
|||
#include "src/models/AbstractModel.h"
|
|||
|
|||
namespace storm { |
|||
namespace models { |
|||
|
|||
template<typename ValueType> |
|||
ModelBasedPseudoModel<ValueType>::ModelBasedPseudoModel(storm::models::AbstractModel<ValueType> const& model) : _model(model) { |
|||
// Intentionally left empty.
|
|||
} |
|||
|
|||
template<typename ValueType> |
|||
NonDeterministicMatrixBasedPseudoModel<ValueType>::NonDeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix<ValueType> const& matrix, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) : _matrix(matrix), _nondeterministicChoiceIndices(nondeterministicChoiceIndices) { |
|||
// Intentionally left empty.
|
|||
} |
|||
|
|||
template<typename ValueType> |
|||
DeterministicMatrixBasedPseudoModel<ValueType>::DeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix<ValueType> const& matrix) : _matrix(matrix) { |
|||
// Intentionally left empty.
|
|||
} |
|||
|
|||
template<typename ValueType> |
|||
typename storm::storage::SparseMatrix<ValueType>::const_rows |
|||
ModelBasedPseudoModel<ValueType>::getRows(uint_fast64_t state) const { |
|||
return this->_model.getRows(state); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
typename storm::storage::SparseMatrix<ValueType>::const_rows |
|||
NonDeterministicMatrixBasedPseudoModel<ValueType>::getRows(uint_fast64_t state) const { |
|||
return this->_matrix.getRows(this->_nondeterministicChoiceIndices[state], this->_nondeterministicChoiceIndices[state + 1] - 1); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
typename storm::storage::SparseMatrix<ValueType>::const_rows |
|||
DeterministicMatrixBasedPseudoModel<ValueType>::getRows(uint_fast64_t state) const { |
|||
return this->_matrix.getRows(state, state); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
uint_fast64_t |
|||
ModelBasedPseudoModel<ValueType>::getNumberOfStates() const { |
|||
return this->_model.getNumberOfStates(); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
uint_fast64_t |
|||
NonDeterministicMatrixBasedPseudoModel<ValueType>::getNumberOfStates() const { |
|||
return this->_matrix.getColumnCount(); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
uint_fast64_t |
|||
DeterministicMatrixBasedPseudoModel<ValueType>::getNumberOfStates() const { |
|||
return this->_matrix.getColumnCount(); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
storm::storage::SparseMatrix<ValueType> |
|||
AbstractPseudoModel<ValueType>::extractPartitionDependencyGraph(storm::storage::Decomposition<storm::storage::StateBlock> const& decomposition) const { |
|||
uint_fast64_t numberOfStates = decomposition.size(); |
|||
|
|||
// First, we need to create a mapping of states to their SCC index, to ease the computation of dependency transitions later.
|
|||
std::vector<uint_fast64_t> stateToBlockMap(this->getNumberOfStates()); |
|||
for (uint_fast64_t i = 0; i < decomposition.size(); ++i) { |
|||
for (auto state : decomposition[i]) { |
|||
stateToBlockMap[state] = i; |
|||
} |
|||
} |
|||
|
|||
// The resulting sparse matrix will have as many rows/columns as there are blocks in the partition.
|
|||
storm::storage::SparseMatrixBuilder<ValueType> dependencyGraphBuilder(numberOfStates, numberOfStates); |
|||
|
|||
for (uint_fast64_t currentBlockIndex = 0; currentBlockIndex < decomposition.size(); ++currentBlockIndex) { |
|||
// Get the next block.
|
|||
typename storm::storage::StateBlock const& block = decomposition[currentBlockIndex]; |
|||
|
|||
// Now, we determine the blocks which are reachable (in one step) from the current block.
|
|||
boost::container::flat_set<uint_fast64_t> allTargetBlocks; |
|||
for (auto state : block) { |
|||
for (auto const& transitionEntry : this->getRows(state)) { |
|||
uint_fast64_t targetBlock = stateToBlockMap[transitionEntry.getColumn()]; |
|||
|
|||
// We only need to consider transitions that are actually leaving the SCC.
|
|||
if (targetBlock != currentBlockIndex) { |
|||
allTargetBlocks.insert(targetBlock); |
|||
} |
|||
} |
|||
} |
|||
|
|||
// Now we can just enumerate all the target SCCs and insert the corresponding transitions.
|
|||
for (auto targetBlock : allTargetBlocks) { |
|||
dependencyGraphBuilder.addNextValue(currentBlockIndex, targetBlock, storm::utility::constantOne<ValueType>()); |
|||
} |
|||
} |
|||
|
|||
return dependencyGraphBuilder.build(); |
|||
} |
|||
|
|||
template class ModelBasedPseudoModel<double>; |
|||
template class NonDeterministicMatrixBasedPseudoModel<double>; |
|||
template class DeterministicMatrixBasedPseudoModel<double>; |
|||
template class ModelBasedPseudoModel <float> ; |
|||
template class NonDeterministicMatrixBasedPseudoModel <float>; |
|||
template class DeterministicMatrixBasedPseudoModel <float>; |
|||
template class ModelBasedPseudoModel<int>; |
|||
template class NonDeterministicMatrixBasedPseudoModel<int>; |
|||
template class DeterministicMatrixBasedPseudoModel<int>; |
|||
} // namespace models
|
|||
} // namespace storm
|
@ -0,0 +1,90 @@ |
|||
#ifndef STORM_MODELS_PSEUDOMODEL_H_ |
|||
#define STORM_MODELS_PSEUDOMODEL_H_ |
|||
|
|||
#include <cstdint> |
|||
#include "src/storage/SparseMatrix.h" |
|||
#include "src/storage/Decomposition.h" |
|||
|
|||
namespace storm { |
|||
namespace models { |
|||
// Forward declare the abstract model class. |
|||
template <typename ValueType> class AbstractModel; |
|||
|
|||
/*! |
|||
* These classes encapsulate the model/transition matrix on which the SCC decomposition is performed. |
|||
* The abstract base class is specialized by three possible representations: |
|||
* - For a model the implementation ModelBasedPseudoModel hands the call to getRows() through to the model |
|||
* - For a matrix of a nondeterministic model the implementation NonDeterministicMatrixBasedPseudoModel emulates the call |
|||
* on the matrix itself like the model function would |
|||
* - For a matrix of a deterministic model the implementation DeterministicMatrixBasedPseudoModel emulates the call |
|||
* on the matrix itself like the model function would |
|||
*/ |
|||
template <typename ValueType> |
|||
class AbstractPseudoModel { |
|||
public: |
|||
AbstractPseudoModel() {} |
|||
virtual ~AbstractPseudoModel() {} |
|||
|
|||
virtual typename storm::storage::SparseMatrix<ValueType>::const_rows getRows(uint_fast64_t state) const = 0; |
|||
|
|||
/*! |
|||
* Calculates the number of states in the represented system. |
|||
* @return The Number of States in the underlying model/transition matrix |
|||
*/ |
|||
virtual uint_fast64_t getNumberOfStates() const = 0; |
|||
|
|||
/*! |
|||
* Extracts the dependency graph from a (pseudo-) model according to the given partition. |
|||
* |
|||
* @param decomposition A decomposition containing the blocks of the partition of the system. |
|||
* @return A sparse matrix with bool entries that represents the dependency graph of the blocks of the partition. |
|||
*/ |
|||
virtual storm::storage::SparseMatrix<ValueType> extractPartitionDependencyGraph(storm::storage::Decomposition<storm::storage::StateBlock> const& decomposition) const; |
|||
}; |
|||
|
|||
template <typename ValueType> |
|||
class ModelBasedPseudoModel : public AbstractPseudoModel<ValueType> { |
|||
public: |
|||
/*! |
|||
* Creates an encapsulation for the SCC decomposition based on a model |
|||
* @param model The Model on which the decomposition is to be performed |
|||
*/ |
|||
ModelBasedPseudoModel(storm::models::AbstractModel<ValueType> const& model); |
|||
virtual typename storm::storage::SparseMatrix<ValueType>::const_rows getRows(uint_fast64_t state) const override; |
|||
virtual uint_fast64_t getNumberOfStates() const override; |
|||
private: |
|||
storm::models::AbstractModel<ValueType> const& _model; |
|||
}; |
|||
|
|||
template <typename ValueType> |
|||
class NonDeterministicMatrixBasedPseudoModel : public AbstractPseudoModel<ValueType> { |
|||
public: |
|||
/*! |
|||
* Creates an encapsulation for the SCC decomposition based on a matrix |
|||
* @param matrix The Matrix on which the decomposition is to be performed |
|||
*/ |
|||
NonDeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix<ValueType> const& matrix, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices); |
|||
virtual typename storm::storage::SparseMatrix<ValueType>::const_rows getRows(uint_fast64_t state) const override; |
|||
virtual uint_fast64_t getNumberOfStates() const override; |
|||
private: |
|||
storm::storage::SparseMatrix<ValueType> const& _matrix; |
|||
std::vector<uint_fast64_t> const& _nondeterministicChoiceIndices; |
|||
}; |
|||
|
|||
template <typename ValueType> |
|||
class DeterministicMatrixBasedPseudoModel : public AbstractPseudoModel<ValueType> { |
|||
public: |
|||
/*! |
|||
* Creates an encapsulation for the SCC decomposition based on a matrix |
|||
* @param matrix The Matrix on which the decomposition is to be performed |
|||
*/ |
|||
DeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix<ValueType> const& matrix); |
|||
virtual typename storm::storage::SparseMatrix<ValueType>::const_rows getRows(uint_fast64_t state) const override; |
|||
virtual uint_fast64_t getNumberOfStates() const override; |
|||
private: |
|||
storm::storage::SparseMatrix<ValueType> const& _matrix; |
|||
}; |
|||
} |
|||
} |
|||
|
|||
#endif // STORM_MODELS_PSEUDOMODEL_H_ |
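// Hedged usage sketch (not part of the original header): wrapping the transition matrix of a
// nondeterministic model so that the SCC decomposition can run on it directly, mirroring the
// calls made by the topological solver. A and nondeterministicChoiceIndices are assumed to be
// the usual sparse transition matrix and row-group index vector of an MDP.
//
//   storm::models::NonDeterministicMatrixBasedPseudoModel<double> pseudoModel(A, nondeterministicChoiceIndices);
//   storm::storage::StronglyConnectedComponentDecomposition<double> sccDecomposition(pseudoModel, false, false);
//   storm::storage::SparseMatrix<double> dependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);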
@ -0,0 +1,467 @@ |
|||
#include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h"
|
|||
|
|||
#include <utility>
|
|||
#include <chrono>
|
|||
|
|||
#include "src/settings/Settings.h"
|
|||
#include "src/utility/vector.h"
|
|||
#include "src/utility/graph.h"
|
|||
#include "src/models/PseudoModel.h"
|
|||
#include "src/storage/StronglyConnectedComponentDecomposition.h"
|
|||
#include "src/exceptions/IllegalArgumentException.h"
|
|||
#include "src/exceptions/InvalidStateException.h"
|
|||
|
|||
#include "log4cplus/logger.h"
|
|||
#include "log4cplus/loggingmacros.h"
|
|||
extern log4cplus::Logger logger; |
|||
|
|||
#include "storm-config.h"
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
# include "cudaForStorm.h"
|
|||
#endif
|
|||
|
|||
namespace storm { |
|||
namespace solver { |
|||
|
|||
template<typename ValueType> |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::TopologicalValueIterationNondeterministicLinearEquationSolver() { |
|||
// Get the settings object to customize solving.
|
|||
storm::settings::Settings* settings = storm::settings::Settings::getInstance(); |
|||
|
|||
// Get appropriate settings.
|
|||
this->maximalNumberOfIterations = settings->getOptionByLongName("maxiter").getArgument(0).getValueAsUnsignedInteger(); |
|||
this->precision = settings->getOptionByLongName("precision").getArgument(0).getValueAsDouble(); |
|||
this->relative = !settings->isSet("absolute"); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative) : NativeNondeterministicLinearEquationSolver<ValueType>(precision, maximalNumberOfIterations, relative) { |
|||
// Intentionally left empty.
|
|||
} |
|||
|
|||
template<typename ValueType> |
|||
NondeterministicLinearEquationSolver<ValueType>* TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::clone() const { |
|||
return new TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>(*this); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const { |
|||
|
|||
#ifdef GPU_USE_FLOAT
|
|||
#define __FORCE_FLOAT_CALCULATION true
|
|||
#else
|
|||
#define __FORCE_FLOAT_CALCULATION false
|
|||
#endif
|
|||
if (__FORCE_FLOAT_CALCULATION && (sizeof(ValueType) == sizeof(double))) { |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver<float> tvindles(precision, maximalNumberOfIterations, relative); |
|||
|
|||
storm::storage::SparseMatrix<float> new_A = A.toValueType<float>(); |
|||
std::vector<float> new_x = storm::utility::vector::toValueType<float>(x); |
|||
std::vector<float> const new_b = storm::utility::vector::toValueType<float>(b); |
|||
|
|||
tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr); |
|||
|
|||
for (size_t i = 0, size = new_x.size(); i < size; ++i) { |
|||
x.at(i) = new_x.at(i); |
|||
} |
|||
return; |
|||
} |
|||
|
|||
// For testing only
|
|||
if (sizeof(ValueType) == sizeof(double)) { |
|||
std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; |
|||
LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>"); |
|||
} else { |
|||
std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; |
|||
LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>"); |
|||
} |
|||
|
|||
// Now, we need to determine the SCCs of the MDP and perform a topological sort.
|
|||
std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices(); |
|||
storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> const pseudoModel(A, nondeterministicChoiceIndices); |
|||
|
|||
// Check if the decomposition is necessary
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
#define __USE_CUDAFORSTORM_OPT true
|
|||
size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(A.getRowCount()), nondeterministicChoiceIndices.size(), static_cast<size_t>(A.getEntryCount())); |
|||
size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95); |
|||
#else
|
|||
#define __USE_CUDAFORSTORM_OPT false
|
|||
size_t const gpuSizeOfCompleteSystem = 0; |
|||
size_t const cudaFreeMemory = 0; |
|||
#endif
|
|||
std::vector<std::pair<bool, storm::storage::StateBlock>> sccDecomposition; |
|||
if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) { |
|||
// Dummy output for SCC Times
|
|||
std::cout << "Computing the SCC Decomposition took 0ms" << std::endl; |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
if (!resetCudaDevice()) { |
|||
LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); |
|||
throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; |
|||
} |
|||
|
|||
std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now(); |
|||
bool result = false; |
|||
size_t globalIterations = 0; |
|||
if (minimize) { |
|||
result = __basicValueIteration_mvReduce_uint64_minimize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); |
|||
} else { |
|||
result = __basicValueIteration_mvReduce_uint64_maximize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); |
|||
} |
|||
LOG4CPLUS_INFO(logger, "Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); |
|||
|
|||
bool converged = false; |
|||
if (!result) { |
|||
converged = false; |
|||
LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); |
|||
throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; |
|||
} else { |
|||
converged = true; |
|||
} |
|||
|
|||
std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now(); |
|||
std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast<std::chrono::milliseconds>(calcEndTime - calcStartTime).count() << "ms." << std::endl; |
|||
|
|||
std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << globalIterations << " iterations in a single block." << std::endl; |
|||
|
|||
// Check if the solver converged and issue a warning otherwise.
|
|||
if (converged) { |
|||
LOG4CPLUS_INFO(logger, "Iterative solver converged after " << globalIterations << " iterations."); |
|||
} else { |
|||
LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << globalIterations << " iterations."); |
|||
} |
|||
#else
|
|||
LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); |
|||
throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; |
|||
#endif
|
|||
} else { |
|||
std::chrono::high_resolution_clock::time_point sccStartTime = std::chrono::high_resolution_clock::now(); |
|||
storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(pseudoModel, false, false); |
|||
|
|||
if (sccDecomposition.size() == 0) { |
|||
LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs."); |
|||
throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs."; |
|||
} |
|||
|
|||
storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); |
|||
std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); |
|||
|
|||
// Calculate the optimal distribution of sccs
|
|||
std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); |
|||
LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); |
|||
|
|||
std::chrono::high_resolution_clock::time_point sccEndTime = std::chrono::high_resolution_clock::now(); |
|||
std::cout << "Computing the SCC Decomposition took " << std::chrono::duration_cast<std::chrono::milliseconds>(sccEndTime - sccStartTime).count() << "ms." << std::endl; |
|||
|
|||
std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now(); |
|||
|
|||
std::vector<ValueType>* currentX = nullptr; |
|||
std::vector<ValueType>* swap = nullptr; |
|||
size_t currentMaxLocalIterations = 0; |
|||
size_t localIterations = 0; |
|||
size_t globalIterations = 0; |
|||
bool converged = true; |
|||
|
|||
// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
|
|||
// solved after all SCCs it depends on have been solved.
|
|||
int counter = 0; |
|||
|
|||
for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { |
|||
bool const useGpu = sccIndexIt->first; |
|||
storm::storage::StateBlock const& scc = sccIndexIt->second; |
|||
|
|||
// Generate a sub matrix
|
|||
storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); |
|||
storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); |
|||
std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount()); |
|||
storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); |
|||
std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount()); |
|||
std::vector<ValueType> sccSubXSwap(sccSubmatrix.getColumnCount()); |
|||
std::vector<ValueType> sccMultiplyResult(sccSubmatrix.getRowCount()); |
|||
|
|||
// Prepare the pointers for swapping in the calculation
|
|||
currentX = &sccSubX; |
|||
swap = &sccSubXSwap; |
|||
|
|||
storm::utility::vector::selectVectorValues<ValueType>(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states)
|
|||
std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); |
|||
sccSubNondeterministicChoiceIndices.at(0) = 0; |
|||
|
|||
// Pre-process all dependent states
|
|||
// Remove outgoing transitions and create the ChoiceIndices
|
|||
uint_fast64_t innerIndex = 0; |
|||
uint_fast64_t outerIndex = 0; |
|||
for (uint_fast64_t state : scc) { |
|||
// Choice Indices
|
|||
sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); |
|||
|
|||
for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { |
|||
typename storm::storage::SparseMatrix<ValueType>::const_rows row = A.getRow(rowGroupIt); |
|||
for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { |
|||
if (!subMatrixIndices.get(rowIt->getColumn())) { |
|||
// This is an outgoing transition of a state in the SCC to a state not included in the SCC
|
|||
// Subtracting Pr(tau) * x_other from b fixes that
|
|||
sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); |
|||
} |
|||
} |
|||
++innerIndex; |
|||
} |
|||
++outerIndex; |
|||
} |
|||
|
|||
// For the current SCC, we need to perform value iteration until convergence.
|
|||
if (useGpu) { |
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
if (!resetCudaDevice()) { |
|||
LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); |
|||
throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; |
|||
} |
|||
|
|||
//LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
|
|||
//LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
|
|||
//LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
|
|||
|
|||
bool result = false; |
|||
localIterations = 0; |
|||
if (minimize) { |
|||
result = __basicValueIteration_mvReduce_uint64_minimize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); |
|||
} else { |
|||
result = __basicValueIteration_mvReduce_uint64_maximize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); |
|||
} |
|||
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); |
|||
|
|||
if (!result) { |
|||
converged = false; |
|||
LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); |
|||
throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; |
|||
} else { |
|||
converged = true; |
|||
} |
|||
|
|||
// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
|
|||
// track of the maximum.
|
|||
if (localIterations > currentMaxLocalIterations) { |
|||
currentMaxLocalIterations = localIterations; |
|||
} |
|||
globalIterations += localIterations; |
|||
#else
|
|||
LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); |
|||
throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; |
|||
#endif
|
|||
} else { |
|||
std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl; |
|||
localIterations = 0; |
|||
converged = false; |
|||
while (!converged && localIterations < this->maximalNumberOfIterations) { |
|||
// Compute x' = A*x + b.
|
|||
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); |
|||
storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB); |
|||
|
|||
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
|
|||
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
|
|||
|
|||
/*
|
|||
Versus: |
|||
A.multiplyWithVector(*currentX, *multiplyResult); |
|||
storm::utility::vector::addVectorsInPlace(*multiplyResult, b); |
|||
*/ |
|||
|
|||
// Reduce the vector x' by applying min/max for all non-deterministic choices.
|
|||
if (minimize) { |
|||
storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); |
|||
} else { |
|||
storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); |
|||
} |
|||
|
|||
// Determine whether the method converged.
|
|||
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
|
|||
// running time. In fact, it is faster. This has to be investigated.
|
|||
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
|
|||
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative); |
|||
|
|||
// Update environment variables.
|
|||
std::swap(currentX, swap); |
|||
|
|||
++localIterations; |
|||
++globalIterations; |
|||
} |
|||
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); |
|||
} |
|||
|
|||
|
|||
// The Result of this SCC has to be taken back into the main result vector
|
|||
innerIndex = 0; |
|||
for (uint_fast64_t state : scc) { |
|||
x.at(state) = currentX->at(innerIndex); |
|||
++innerIndex; |
|||
} |
|||
|
|||
// Since the pointers for swapping in the calculation point to temps they should not be valid anymore
|
|||
currentX = nullptr; |
|||
swap = nullptr; |
|||
|
|||
// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
|
|||
// track of the maximum.
|
|||
if (localIterations > currentMaxLocalIterations) { |
|||
currentMaxLocalIterations = localIterations; |
|||
} |
|||
} |
|||
|
|||
std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << localIterations << " iterations in a single block." << std::endl; |
|||
|
|||
// Check if the solver converged and issue a warning otherwise.
|
|||
if (converged) { |
|||
LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); |
|||
} else { |
|||
LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations."); |
|||
} |
|||
|
|||
std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now(); |
|||
std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast<std::chrono::milliseconds>(calcEndTime - calcStartTime).count() << "ms." << std::endl; |
|||
} |
|||
} |
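// Hedged summary (not part of the original file): each CPU sweep above computes, for every
// state s of the current SCC and each of its choices c,
//     x'[s] = min/max over c of ( b[c] + sum over s' of A[c, s'] * x[s'] ),
// and stops once x and x' agree up to the configured precision (relative or absolute).
// Because the SCCs are processed in topological order, transitions leaving the SCC refer to
// states whose values are already fixed; their contribution was folded into sccSubB above.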
|||
|
|||
template<typename ValueType> |
|||
std::vector<std::pair<bool, storm::storage::StateBlock>> |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const { |
|||
std::vector<std::pair<bool, storm::storage::StateBlock>> result; |
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
// 95% to have a bit of padding
|
|||
size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95); |
|||
size_t lastResultIndex = 0; |
|||
|
|||
std::vector<uint_fast64_t> const& rowGroupIndices = matrix.getRowGroupIndices(); |
|||
|
|||
size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(matrix.getRowCount()), rowGroupIndices.size(), static_cast<size_t>(matrix.getEntryCount())); |
|||
size_t const gpuSizePerRowGroup = std::max(static_cast<size_t>(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast<size_t>(1)); |
|||
size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup; |
|||
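// Illustration with made-up numbers: if the complete system needs ~4 GB and has 4 million row
// groups, gpuSizePerRowGroup is roughly 1 KB; with ~3.8 GB of padded free memory this yields
// a maxRowGroupsPerMemory of roughly 3.8 million row groups per GPU-sized block.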
|
|||
size_t currentSize = 0; |
|||
size_t neededReserveSize = 0; |
|||
size_t startIndex = 0; |
|||
for (size_t i = 0; i < topologicalSort.size(); ++i) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]]; |
|||
size_t const currentSccSize = scc.size(); |
|||
|
|||
uint_fast64_t rowCount = 0; |
|||
uint_fast64_t entryCount = 0; |
|||
|
|||
for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { |
|||
rowCount += matrix.getRowGroupSize(*sccIt); |
|||
entryCount += matrix.getRowGroupEntryCount(*sccIt); |
|||
} |
|||
|
|||
size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(rowCount), scc.size(), static_cast<size_t>(entryCount)); |
|||
|
|||
if ((currentSize + sccSize) <= cudaFreeMemory) { |
|||
// There is enough space left in the current group
|
|||
neededReserveSize += currentSccSize; |
|||
currentSize += sccSize; |
|||
} else { |
|||
// This would make the last open group too big for the GPU
|
|||
|
|||
if (startIndex < i) { |
|||
if ((startIndex + 1) < i) { |
|||
// More than one component
|
|||
std::vector<uint_fast64_t> tempGroups; |
|||
tempGroups.reserve(neededReserveSize); |
|||
|
|||
// Copy the first group to make inplace_merge possible
|
|||
storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; |
|||
tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); |
|||
|
|||
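// For set counts <= 80, inplace merge is faster (same heuristic as for the final group below).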
if (((startIndex + 1) + 80) >= i) { |
|||
size_t lastSize = 0; |
|||
for (size_t j = startIndex + 1; j < i; ++j) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; |
|||
lastSize = tempGroups.size(); |
|||
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); |
|||
std::vector<uint_fast64_t>::iterator middleIterator = tempGroups.begin(); |
|||
std::advance(middleIterator, lastSize); |
|||
std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); |
|||
} |
|||
} else { |
|||
// Use std::sort
|
|||
for (size_t j = startIndex + 1; j < i; ++j) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; |
|||
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); |
|||
} |
|||
std::sort(tempGroups.begin(), tempGroups.end()); |
|||
} |
|||
result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); |
|||
} else { |
|||
// Only one group, copy construct.
|
|||
result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); |
|||
} |
|||
++lastResultIndex; |
|||
} |
|||
|
|||
if (sccSize <= cudaFreeMemory) { |
|||
currentSize = sccSize; |
|||
neededReserveSize = currentSccSize; |
|||
startIndex = i; |
|||
} else { |
|||
// This group is too big to fit into the CUDA memory by itself
|
|||
result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]])))); |
|||
++lastResultIndex; |
|||
|
|||
currentSize = 0; |
|||
neededReserveSize = 0; |
|||
startIndex = i + 1; |
|||
} |
|||
} |
|||
} |
|||
|
|||
size_t const topologicalSortSize = topologicalSort.size(); |
|||
if (startIndex < topologicalSortSize) { |
|||
if ((startIndex + 1) < topologicalSortSize) { |
|||
// More than one component
|
|||
std::vector<uint_fast64_t> tempGroups; |
|||
tempGroups.reserve(neededReserveSize); |
|||
|
|||
// Copy the first group to make inplace_merge possible
|
|||
storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; |
|||
tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); |
|||
|
|||
// For set counts <= 80, Inplace Merge is faster
|
|||
if (((startIndex + 1) + 80) >= topologicalSortSize) { |
|||
size_t lastSize = 0; |
|||
for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; |
|||
lastSize = tempGroups.size(); |
|||
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); |
|||
std::vector<uint_fast64_t>::iterator middleIterator = tempGroups.begin(); |
|||
std::advance(middleIterator, lastSize); |
|||
std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); |
|||
} |
|||
} else { |
|||
// Use std::sort
|
|||
for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; |
|||
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); |
|||
} |
|||
std::sort(tempGroups.begin(), tempGroups.end()); |
|||
} |
|||
result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); |
|||
} |
|||
else { |
|||
// Only one group, copy construct.
|
|||
result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); |
|||
} |
|||
++lastResultIndex; |
|||
} |
|||
#else
|
|||
for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; |
|||
result.push_back(std::make_pair(false, scc)); |
|||
} |
|||
#endif
|
|||
return result; |
|||
} |
|||
|
|||
// Explicitly instantiate the solver.
|
|||
template class TopologicalValueIterationNondeterministicLinearEquationSolver<double>; |
|||
template class TopologicalValueIterationNondeterministicLinearEquationSolver<float>; |
|||
} // namespace solver
|
|||
} // namespace storm
|
@ -0,0 +1,97 @@ |
|||
#ifndef STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ |
|||
#define STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ |
|||
|
|||
#include "src/solver/NativeNondeterministicLinearEquationSolver.h" |
|||
#include "src/storage/StronglyConnectedComponentDecomposition.h" |
|||
#include "src/storage/SparseMatrix.h" |
|||
|
|||
#include <utility> |
|||
#include <vector> |
|||
|
|||
#include "storm-config.h" |
|||
#ifdef STORM_HAVE_CUDAFORSTORM |
|||
# include "cudaForStorm.h" |
|||
#endif |
|||
|
|||
namespace storm { |
|||
namespace solver { |
|||
|
|||
/*! |
|||
* A class that uses SCC decompositions to solve a nondeterministic linear equation system. |
|||
*/ |
|||
template<class ValueType> |
|||
class TopologicalValueIterationNondeterministicLinearEquationSolver : public NativeNondeterministicLinearEquationSolver<ValueType> { |
|||
public: |
|||
/*! |
|||
* Constructs a nondeterministic linear equation solver with parameters being set according to the settings |
|||
* object. |
|||
*/ |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver(); |
|||
|
|||
/*! |
|||
* Constructs a nondeterministic linear equation solver with the given parameters. |
|||
* |
|||
* @param precision The precision to use for convergence detection. |
|||
* @param maximalNumberOfIterations The maximal number of iterations to perform before the iteration is aborted. |
|||
* @param relative If set, the relative error rather than the absolute error is considered for convergence |
|||
* detection. |
|||
*/ |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative = true); |
|||
|
|||
virtual NondeterministicLinearEquationSolver<ValueType>* clone() const override; |
|||
|
|||
virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult = nullptr, std::vector<ValueType>* newX = nullptr) const override; |
|||
private: |
|||
/*! |
|||
* Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory. |
|||
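* The result is a list of pairs, each consisting of a flag that indicates whether the block fits into the available CUDA memory and the corresponding set of states. |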
*/ |
|||
std::vector<std::pair<bool, storm::storage::StateBlock>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const; |
|||
}; |
|||
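// Usage sketch (illustrative only; names follow the declarations above): |
//   TopologicalValueIterationNondeterministicLinearEquationSolver<double> solver; |
//   solver.solveEquationSystem(true /* minimize */, A, x, b); |
// iterates over the SCCs of A in topological order and computes the fixpoint of x = min/max (A*x + b). |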
|
|||
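// The helpers below dispatch one value-iteration run to the CUDA plugin. Only the double and |
// float specializations are meaningful; the unspecialized templates exist solely to fail when |
// instantiated with any other value type. |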
template <typename ValueType> |
|||
bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
// Intentionally unimplemented; only the specializations below are valid. |
|||
throw; |
|||
} |
|||
template <> |
|||
inline bool __basicValueIteration_mvReduce_uint64_minimize<double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
#ifdef STORM_HAVE_CUDAFORSTORM |
|||
return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
#else |
|||
throw; |
|||
#endif |
|||
} |
|||
template <> |
|||
inline bool __basicValueIteration_mvReduce_uint64_minimize<float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
#ifdef STORM_HAVE_CUDAFORSTORM |
|||
return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
#else |
|||
throw; |
|||
#endif |
|||
} |
|||
|
|||
template <typename ValueType> |
|||
bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
// Intentionally unimplemented; only the specializations below are valid. |
|||
throw; |
|||
} |
|||
template <> |
|||
inline bool __basicValueIteration_mvReduce_uint64_maximize<double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
#ifdef STORM_HAVE_CUDAFORSTORM |
|||
return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
#else |
|||
throw; |
|||
#endif |
|||
} |
|||
template <> |
|||
inline bool __basicValueIteration_mvReduce_uint64_maximize<float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
#ifdef STORM_HAVE_CUDAFORSTORM |
|||
return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
#else |
|||
throw; |
|||
#endif |
|||
} |
|||
} // namespace solver |
|||
} // namespace storm |
|||
|
|||
#endif /* STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ */ |
@ -0,0 +1,240 @@ |
|||
#include "gtest/gtest.h"
|
|||
#include "storm-config.h"
|
|||
|
|||
#include "src/solver/NativeNondeterministicLinearEquationSolver.h"
|
|||
#include "src/settings/Settings.h"
|
|||
#include "src/modelchecker/prctl/SparseMdpPrctlModelChecker.h"
|
|||
#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h"
|
|||
#include "src/parser/AutoParser.h"
|
|||
|
|||
#include "storm-config.h"
|
|||
|
|||
TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { |
|||
storm::settings::Settings* s = storm::settings::Settings::getInstance(); |
|||
std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew")->as<storm::models::Mdp<double>>(); |
|||
|
|||
ASSERT_EQ(mdp->getNumberOfStates(), 169ull); |
|||
ASSERT_EQ(mdp->getNumberOfTransitions(), 436ull); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp); |
|||
|
|||
storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("two"); |
|||
storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
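// The trailing boolean of ProbabilisticNoBoundOperator selects the optimization direction
// (true appears to request the minimizing variant, false the maximizing one), so every
// property below is checked in both directions.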
|
|||
std::vector<double> result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0277777612209320068), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("two"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0277777612209320068), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("three"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0555555224418640136), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("three"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0555555224418640136), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("four"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.083333283662796020508), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("four"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.083333283662796020508), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 7.33332904), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
std::shared_ptr<storm::models::Mdp<double>> stateRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", "")->as<storm::models::Mdp<double>>(); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> stateRewardModelChecker(*stateRewardMdp); |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 7.33332904), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
std::shared_ptr<storm::models::Mdp<double>> stateAndTransitionRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew")->as<storm::models::Mdp<double>>(); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp); |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 14.6666581), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 14.666663), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
} |
|||
|
|||
TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { |
|||
storm::settings::Settings* s = storm::settings::Settings::getInstance(); |
|||
std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.trans.rew")->as<storm::models::Mdp<double>>(); |
|||
|
|||
ASSERT_EQ(mdp->getNumberOfStates(), 3172ull); |
|||
ASSERT_EQ(mdp->getNumberOfTransitions(), 7144ull); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp); |
|||
|
|||
storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
std::vector<double> result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 1), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 1), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::BoundedEventually<double>* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0625), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0625), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 4.285701547), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 4.285703591), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
} |
@ -0,0 +1,354 @@ |
|||
#include "gtest/gtest.h"
|
|||
#include "src/storage/SparseMatrix.h"
|
|||
#include "src/exceptions/InvalidStateException.h"
|
|||
#include "src/exceptions/OutOfRangeException.h"
|
|||
|
|||
#include "storm-config.h"
|
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
|
|||
#include "cudaForStorm.h"
|
|||
|
|||
TEST(CudaPlugin, SpMV_4x4) { |
|||
storm::storage::SparseMatrixBuilder<double> matrixBuilder(4, 4, 10); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 8.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 2, -5.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 3, 2.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 0, 2.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 2.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, 4.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 4.0)); |
|||
|
|||
|
|||
storm::storage::SparseMatrix<double> matrix; |
|||
ASSERT_NO_THROW(matrix = matrixBuilder.build()); |
|||
|
|||
ASSERT_EQ(4, matrix.getRowCount()); |
|||
ASSERT_EQ(4, matrix.getColumnCount()); |
|||
ASSERT_EQ(10, matrix.getEntryCount()); |
|||
|
|||
std::vector<double> x({0, 4, 1, 1}); |
|||
std::vector<double> b({0, 0, 0, 0}); |
|||
|
|||
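// Hand-computed expectation for b = A * x with x = (0, 4, 1, 1):
//   b[0] = 1*4 + (-1)*1 = 3
//   b[1] = 8*0 + 7*4 + (-5)*1 + 2*1 = 25
//   b[2] = 2*0 + 2*4 + 4*1 + 4*1 = 16
//   b[3] = 0 (row 3 has no entries)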
ASSERT_NO_THROW(basicValueIteration_spmv_uint64_double(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); |
|||
|
|||
ASSERT_EQ(b.at(0), 3); |
|||
ASSERT_EQ(b.at(1), 25); |
|||
ASSERT_EQ(b.at(2), 16); |
|||
ASSERT_EQ(b.at(3), 0); |
|||
} |
|||
|
|||
TEST(CudaPlugin, SpMV_4x4_float) { |
|||
storm::storage::SparseMatrixBuilder<float> matrixBuilder(4, 4, 10); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 8.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 2, -5.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 3, 2.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 0, 2.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 2.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, 4.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 4.0f)); |
|||
|
|||
|
|||
storm::storage::SparseMatrix<float> matrix; |
|||
ASSERT_NO_THROW(matrix = matrixBuilder.build()); |
|||
|
|||
ASSERT_EQ(4, matrix.getRowCount()); |
|||
ASSERT_EQ(4, matrix.getColumnCount()); |
|||
ASSERT_EQ(10, matrix.getEntryCount()); |
|||
|
|||
std::vector<float> x({ 0.f, 4.f, 1.f, 1.f }); |
|||
std::vector<float> b({ 0.f, 0.f, 0.f, 0.f }); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); |
|||
|
|||
ASSERT_EQ(b.at(0), 3); |
|||
ASSERT_EQ(b.at(1), 25); |
|||
ASSERT_EQ(b.at(2), 16); |
|||
ASSERT_EQ(b.at(3), 0); |
|||
} |
|||
|
|||
TEST(CudaPlugin, SpMV_VerySmall) { |
|||
storm::storage::SparseMatrixBuilder<double> matrixBuilder(2, 2, 2); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0)); |
|||
|
|||
storm::storage::SparseMatrix<double> matrix; |
|||
ASSERT_NO_THROW(matrix = matrixBuilder.build()); |
|||
|
|||
ASSERT_EQ(2, matrix.getRowCount()); |
|||
ASSERT_EQ(2, matrix.getColumnCount()); |
|||
ASSERT_EQ(2, matrix.getEntryCount()); |
|||
|
|||
std::vector<double> x({ 4.0, 8.0 }); |
|||
std::vector<double> b({ 0.0, 0.0 }); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_spmv_uint64_double(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); |
|||
|
|||
ASSERT_EQ(b.at(0), 4.0); |
|||
ASSERT_EQ(b.at(1), 16.0); |
|||
} |
|||
|
|||
TEST(CudaPlugin, SpMV_VerySmall_float) { |
|||
storm::storage::SparseMatrixBuilder<float> matrixBuilder(2, 2, 2); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0)); |
|||
|
|||
storm::storage::SparseMatrix<float> matrix; |
|||
ASSERT_NO_THROW(matrix = matrixBuilder.build()); |
|||
|
|||
ASSERT_EQ(2, matrix.getRowCount()); |
|||
ASSERT_EQ(2, matrix.getColumnCount()); |
|||
ASSERT_EQ(2, matrix.getEntryCount()); |
|||
|
|||
std::vector<float> x({ 4.0, 8.0 }); |
|||
std::vector<float> b({ 0.0, 0.0 }); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); |
|||
|
|||
ASSERT_EQ(b.at(0), 4.0); |
|||
ASSERT_EQ(b.at(1), 16.0); |
|||
} |
|||
|
|||
TEST(CudaPlugin, AddVectorsInplace) { |
|||
std::vector<double> vectorA_1 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; |
|||
std::vector<double> vectorA_2 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; |
|||
std::vector<double> vectorA_3 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; |
|||
std::vector<double> vectorB = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; |
|||
std::vector<double> vectorC = { -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0 }; |
|||
|
|||
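// basicValueIteration_addVectorsInplace_double adds the second vector element-wise into the
// first (a[i] += b[i]); the loop at the end recomputes the sums on the CPU for comparison.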
ASSERT_EQ(vectorA_1.size(), 8); |
|||
ASSERT_EQ(vectorA_2.size(), 8); |
|||
ASSERT_EQ(vectorA_3.size(), 8); |
|||
ASSERT_EQ(vectorB.size(), 8); |
|||
ASSERT_EQ(vectorC.size(), 8); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_1, vectorB)); |
|||
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_2, vectorC)); |
|||
|
|||
ASSERT_EQ(vectorA_1.size(), 8); |
|||
ASSERT_EQ(vectorA_2.size(), 8); |
|||
ASSERT_EQ(vectorA_3.size(), 8); |
|||
ASSERT_EQ(vectorB.size(), 8); |
|||
ASSERT_EQ(vectorC.size(), 8); |
|||
|
|||
for (size_t i = 0; i < vectorA_3.size(); ++i) { |
|||
double cpu_result_b = vectorA_3.at(i) + vectorB.at(i); |
|||
double cpu_result_c = vectorA_3.at(i) + vectorC.at(i); |
|||
|
|||
ASSERT_EQ(cpu_result_b, vectorA_1.at(i)); |
|||
ASSERT_EQ(cpu_result_c, vectorA_2.at(i)); |
|||
} |
|||
} |
|||
|
|||
TEST(CudaPlugin, AddVectorsInplace_float) { |
|||
std::vector<float> vectorA_1 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; |
|||
std::vector<float> vectorA_2 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; |
|||
std::vector<float> vectorA_3 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; |
|||
std::vector<float> vectorB = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; |
|||
std::vector<float> vectorC = { -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f }; |
|||
|
|||
ASSERT_EQ(vectorA_1.size(), 8); |
|||
ASSERT_EQ(vectorA_2.size(), 8); |
|||
ASSERT_EQ(vectorA_3.size(), 8); |
|||
ASSERT_EQ(vectorB.size(), 8); |
|||
ASSERT_EQ(vectorC.size(), 8); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_1, vectorB)); |
|||
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_2, vectorC)); |
|||
|
|||
ASSERT_EQ(vectorA_1.size(), 8); |
|||
ASSERT_EQ(vectorA_2.size(), 8); |
|||
ASSERT_EQ(vectorA_3.size(), 8); |
|||
ASSERT_EQ(vectorB.size(), 8); |
|||
ASSERT_EQ(vectorC.size(), 8); |
|||
|
|||
for (size_t i = 0; i < vectorA_3.size(); ++i) { |
|||
float cpu_result_b = vectorA_3.at(i) + vectorB.at(i); |
|||
float cpu_result_c = vectorA_3.at(i) + vectorC.at(i); |
|||
|
|||
ASSERT_EQ(cpu_result_b, vectorA_1.at(i)); |
|||
ASSERT_EQ(cpu_result_c, vectorA_2.at(i)); |
|||
} |
|||
} |
|||
|
|||
TEST(CudaPlugin, ReduceGroupedVector) { |
|||
std::vector<double> groupedVector = { |
|||
0.0, -1000.0, 0.000004, // Group 0
|
|||
5.0, // Group 1
|
|||
0.0, 1.0, 2.0, 3.0, // Group 2
|
|||
-1000.0, -3.14, -0.0002,// Group 3 (neg only)
|
|||
25.25, 25.25, 25.25, // Group 4
|
|||
0.0, 0.0, 1.0, // Group 5
|
|||
-0.000001, 0.000001 // Group 6
|
|||
}; |
|||
std::vector<uint_fast64_t> grouping = { |
|||
0, 3, 4, 8, 11, 14, 17, 19 |
|||
}; |
|||
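// Group g covers the entries groupedVector[grouping[g]] .. groupedVector[grouping[g + 1] - 1];
// the reduce kernels keep the minimum (resp. maximum) of each group, mirroring the resolution
// of the nondeterministic choices of one row group during value iteration.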
|
|||
std::vector<double> result_minimize = { |
|||
-1000.0, // Group 0
|
|||
5.0, |
|||
0.0, |
|||
-1000.0, |
|||
25.25, |
|||
0.0, |
|||
-0.000001 |
|||
}; |
|||
std::vector<double> result_maximize = { |
|||
0.000004, |
|||
5.0, |
|||
3.0, |
|||
-0.0002, |
|||
25.25, |
|||
1.0, |
|||
0.000001 |
|||
}; |
|||
|
|||
std::vector<double> result_cuda_minimize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; |
|||
std::vector<double> result_cuda_maximize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_minimize(groupedVector, grouping, result_cuda_minimize)); |
|||
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_maximize(groupedVector, grouping, result_cuda_maximize)); |
|||
|
|||
for (size_t i = 0; i < result_minimize.size(); ++i) { |
|||
ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i)); |
|||
ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i)); |
|||
} |
|||
} |
|||
|
|||
TEST(CudaPlugin, ReduceGroupedVector_float) { |
|||
std::vector<float> groupedVector = { |
|||
0.0f, -1000.0f, 0.000004f, // Group 0
|
|||
5.0f, // Group 1
|
|||
0.0f, 1.0f, 2.0f, 3.0f, // Group 2
|
|||
-1000.0f, -3.14f, -0.0002f,// Group 3 (neg only)
|
|||
25.25f, 25.25f, 25.25f, // Group 4
|
|||
0.0f, 0.0f, 1.0f, // Group 5
|
|||
-0.000001f, 0.000001f // Group 6
|
|||
}; |
|||
std::vector<uint_fast64_t> grouping = { |
|||
0, 3, 4, 8, 11, 14, 17, 19 |
|||
}; |
|||
|
|||
std::vector<float> result_minimize = { |
|||
-1000.0f, // Group 0
|
|||
5.0f, |
|||
0.0f, |
|||
-1000.0f, |
|||
25.25f, |
|||
0.0f, |
|||
-0.000001f |
|||
}; |
|||
std::vector<float> result_maximize = { |
|||
0.000004f, |
|||
5.0f, |
|||
3.0f, |
|||
-0.0002f, |
|||
25.25f, |
|||
1.0f, |
|||
0.000001f |
|||
}; |
|||
|
|||
std::vector<float> result_cuda_minimize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; |
|||
std::vector<float> result_cuda_maximize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_minimize(groupedVector, grouping, result_cuda_minimize)); |
|||
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_maximize(groupedVector, grouping, result_cuda_maximize)); |
|||
|
|||
for (size_t i = 0; i < result_minimize.size(); ++i) { |
|||
ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i)); |
|||
ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i)); |
|||
} |
|||
} |
|||
|
|||
TEST(CudaPlugin, equalModuloPrecision) { |
|||
std::vector<double> x = { |
|||
123.45, 67.8, 901.23, 456789.012, 3.456789, -4567890.12 |
|||
}; |
|||
std::vector<double> y1 = { |
|||
0.45, 0.8, 0.23, 0.012, 0.456789, -0.12 |
|||
}; |
|||
std::vector<double> y2 = { |
|||
0.45, 0.8, 0.23, 456789.012, 0.456789, -4567890.12 |
|||
}; |
|||
std::vector<double> x2; |
|||
std::vector<double> x3; |
|||
std::vector<double> y3; |
|||
std::vector<double> y4; |
|||
x2.reserve(1000); |
|||
x3.reserve(1000); |
|||
y3.reserve(1000); |
|||
y4.reserve(1000); |
|||
for (size_t i = 0; i < 1000; ++i) { |
|||
x2.push_back(static_cast<double>(i)); |
|||
y3.push_back(1.0); |
|||
x3.push_back(-(1000.0 - static_cast<double>(i))); |
|||
y4.push_back(1.0); |
|||
} |
|||
|
|||
double maxElement1 = 0.0; |
|||
double maxElement2 = 0.0; |
|||
double maxElement3 = 0.0; |
|||
double maxElement4 = 0.0; |
|||
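// maxElement is expected to receive the largest (absolute resp. relative) difference between
// the two vectors, e.g. for x vs. y1 above: |(-4567890.12) - (-0.12)| = 4567890.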
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y1, maxElement1)); |
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y2, maxElement2)); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x2, y3, maxElement3)); |
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x3, y4, maxElement4)); |
|||
|
|||
ASSERT_DOUBLE_EQ(4567890.0, maxElement1); |
|||
ASSERT_DOUBLE_EQ(901.0, maxElement2); |
|||
|
|||
ASSERT_DOUBLE_EQ(998.0, maxElement3); |
|||
ASSERT_DOUBLE_EQ(1001.0, maxElement4); |
|||
} |
|||
|
|||
TEST(CudaPlugin, equalModuloPrecision_float) { |
|||
std::vector<float> x = { |
|||
123.45f, 67.8f, 901.23f, 456789.012f, 3.456789f, -4567890.12f |
|||
}; |
|||
std::vector<float> y1 = { |
|||
0.45f, 0.8f, 0.23f, 0.012f, 0.456789f, -0.12f |
|||
}; |
|||
std::vector<float> y2 = { |
|||
0.45f, 0.8f, 0.23f, 456789.012f, 0.456789f, -4567890.12f |
|||
}; |
|||
std::vector<float> x2; |
|||
std::vector<float> x3; |
|||
std::vector<float> y3; |
|||
std::vector<float> y4; |
|||
x2.reserve(1000); |
|||
x3.reserve(1000); |
|||
y3.reserve(1000); |
|||
y4.reserve(1000); |
|||
for (size_t i = 0; i < 1000; ++i) { |
|||
x2.push_back(static_cast<float>(i)); |
|||
y3.push_back(1.0f); |
|||
x3.push_back(-(1000.0f - static_cast<float>(i))); |
|||
y4.push_back(1.0f); |
|||
} |
|||
|
|||
float maxElement1 = 0.0f; |
|||
float maxElement2 = 0.0f; |
|||
float maxElement3 = 0.0f; |
|||
float maxElement4 = 0.0f; |
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y1, maxElement1)); |
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y2, maxElement2)); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x2, y3, maxElement3)); |
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x3, y4, maxElement4)); |
|||
|
|||
ASSERT_FLOAT_EQ(4567890.0f, maxElement1); |
|||
ASSERT_FLOAT_EQ(901.0f, maxElement2); |
|||
|
|||
ASSERT_FLOAT_EQ(998.0f, maxElement3); |
|||
ASSERT_FLOAT_EQ(1001.0f, maxElement4); |
|||
} |
|||
|
|||
#endif
|
@ -0,0 +1,163 @@ |
|||
#include "gtest/gtest.h"
|
|||
#include "storm-config.h"
|
|||
|
|||
#include "src/settings/Settings.h"
|
|||
#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h"
|
|||
#include "src/solver/NativeNondeterministicLinearEquationSolver.h"
|
|||
#include "src/parser/AutoParser.h"
|
|||
|
|||
TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { |
|||
storm::settings::Settings* s = storm::settings::Settings::getInstance(); |
|||
std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.trans.rew")->as<storm::models::Mdp<double>>(); |
|||
|
|||
ASSERT_EQ(mdp->getNumberOfStates(), 2095783ull); |
|||
ASSERT_EQ(mdp->getNumberOfTransitions(), 7714385ull); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp); |
|||
|
|||
storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
std::vector<double> result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::BoundedEventually<double>* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 6.172433512), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 6.1724344), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete rewardFormula; |
|||
} |
|||
|
|||
TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Consensus) { |
|||
storm::settings::Settings* s = storm::settings::Settings::getInstance(); |
|||
// Increase the maximal number of iterations, because the solver does not converge otherwise.
|
|||
// This is done in the main cpp unit of the test suite.
|
|||
|
|||
std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.tra", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.lab", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.steps.state.rew", "")->as<storm::models::Mdp<double>>(); |
|||
|
|||
ASSERT_EQ(mdp->getNumberOfStates(), 63616ull); |
|||
ASSERT_EQ(mdp->getNumberOfTransitions(), 213472ull); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp); |
|||
|
|||
storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
std::vector<double> result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
storm::property::prctl::Ap<double>* apFormula2 = new storm::property::prctl::Ap<double>("all_coins_equal_0"); |
|||
storm::property::prctl::And<double>* andFormula = new storm::property::prctl::And<double>(apFormula, apFormula2); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(andFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 0.4374282832), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
apFormula2 = new storm::property::prctl::Ap<double>("all_coins_equal_1"); |
|||
andFormula = new storm::property::prctl::And<double>(apFormula, apFormula2); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(andFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 0.5293286369), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
apFormula2 = new storm::property::prctl::Ap<double>("agree"); |
|||
storm::property::prctl::Not<double>* notFormula = new storm::property::prctl::Not<double>(apFormula2); |
|||
andFormula = new storm::property::prctl::And<double>(apFormula, notFormula); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(andFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 0.10414097), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
storm::property::prctl::BoundedEventually<double>* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 50ull); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 50ull); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 1725.593313), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 2183.142422), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete rewardFormula; |
|||
} |