Final touches on cuda nondeterministic linear equation solver & modelchecker

Former-commit-id: c549ae0401
12 years ago · 8ebc0e4640
27 changed files with 70 additions and 2663 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -27,7 +27,6 @@ option(STORM_USE_INTELTBB "Sets whether the Intel TBB libraries should be used."
 option(STORM_USE_COTIRE "Sets whether Cotire should be used (for building precompiled headers)." OFF)
 option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF)
 option(USE_LIBCXX "Sets whether the standard library is libc++." OFF)
-option(STORM_USE_CUDAFORSTORM "Sets whether StoRM is built with its CUDA extension." OFF)
 set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).")
 set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).")
 set(CUDA_ROOT "" CACHE STRING "The root directory of CUDA.")
@ -227,11 +226,7 @@ endif()
 set(STORM_CPP_GLPK_DEF "define")

 # CUDA Defines
-if (STORM_USE_CUDAFORSTORM)
-	set(STORM_CPP_CUDAFORSTORM_DEF "define")
-else()
-	set(STORM_CPP_CUDAFORSTORM_DEF "undef")
-endif()
+set(STORM_CPP_CUDAFORSTORM_DEF "undef")

 # Z3 Defines
 if (ENABLE_Z3)
@ -492,9 +487,6 @@ else(GMP_FOUND)
    endif(ENABLE_MSAT)
 endif(GMP_FOUND)

-if (STORM_USE_CUDAFORSTORM)
-	link_directories("${PROJECT_BINARY_DIR}/cudaForStorm/lib")
-endif()
 if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL ""))
 	set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib")
 endif ()
@ -529,20 +521,6 @@ target_link_libraries(storm ${Boost_LIBRARIES})
 #message(STATUS "BOOST_INCLUDE_DIRS is ${Boost_INCLUDE_DIRS}")
 #message(STATUS "BOOST_LIBRARY_DIRS is ${Boost_LIBRARY_DIRS}")

-#############################################################
-##
-##	CUDA For Storm
-##
-#############################################################
-if (STORM_USE_CUDAFORSTORM)
-    message (STATUS "StoRM - Linking with CudaForStorm")
-	include_directories("${PROJECT_BINARY_DIR}/cudaForStorm/include")
-	include_directories("${PROJECT_SOURCE_DIR}/resources/cudaForStorm")
-    target_link_libraries(storm cudaForStorm)
-    target_link_libraries(storm-functional-tests cudaForStorm)
-    target_link_libraries(storm-performance-tests cudaForStorm)
-endif(STORM_USE_CUDAFORSTORM)
-
 #############################################################
 ##
 ##	CUDD
--- a/resources/cudaForStorm/CMakeAlignmentCheck.cpp
+++ b/resources/cudaForStorm/CMakeAlignmentCheck.cpp
@ -1,64 +0,0 @@
-/*
- * This is component of StoRM - Cuda Plugin to check whether type alignment matches the assumptions done while optimizing the code.
- */
- #include <cstdint>
- #include <utility>
- #include <vector>
- 
- #define CONTAINER_SIZE 100ul
- 
- template <typename IndexType, typename ValueType>
- int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
-	std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
-	for (size_t i = 0; i < containerSize; ++i) {
-		myVector->push_back(std::make_pair(firstValue, secondValue));
-	}
-	size_t myVectorSize = myVector->size();
-	IndexType* firstStart = &(myVector->at(0).first);
-	IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
-	ValueType* secondStart = &(myVector->at(0).second);
-	ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
-	size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
-	size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
-	size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
-	size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);
-	
-	delete myVector;
-	myVector = nullptr;
-	
-	if (myVectorSize != containerSize) {
-		return -2;
-	}
-	
-	// Check for alignment:
-	// Requirement is that the pairs are aligned like: first, second, first, second, first, second, ...
-	if (sizeof(IndexType) != sizeof(ValueType)) {
-		return -3;
-	}
-	if (startOffset != sizeof(IndexType)) {
-		return -4;
-	}
-	if (endOffset != sizeof(IndexType)) {
-		return -5;
-	}
-	if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
-		return -6;
-	}
-	if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
-		return -7;
-	}
-	
-	return 0;
- }
- 
- 
- int main(int argc, char* argv[]) {
-	int result = 0;
-	
-	result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
-	if (result != 0) {
-		return result;
-	}
-	
-	return 0;
- }
--- a/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
+++ b/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
@ -1,31 +0,0 @@
-/*
- * This is component of StoRM - Cuda Plugin to check whether a pair of uint_fast64_t and float gets auto-aligned to match 64bit boundaries
- */
- #include <cstdint>
- #include <utility>
- #include <vector>
- 
- #define CONTAINER_SIZE 100ul
-
-int main(int argc, char* argv[]) {
-	int result = 0;
-
-	std::vector<std::pair<uint_fast64_t, float>> myVector;
-	for (size_t i = 0; i < CONTAINER_SIZE; ++i) {
-		myVector.push_back(std::make_pair(i, 42.12345f * i));
-	}
-	
-	char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first));
-	char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first));
-	ptrdiff_t uintDiff = secondUintPointer - firstUintPointer;
-	
-	if (uintDiff == (2 * sizeof(uint_fast64_t))) {
-		result = 2;
-	} else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) {
-		result = 3;
-	} else {
-		result = -5;
-	}
-	
-	return result;
- }
--- a/resources/cudaForStorm/CMakeLists.txt
+++ b/resources/cudaForStorm/CMakeLists.txt
@ -1,294 +0,0 @@
-cmake_minimum_required (VERSION 2.8.6)
-
-# Set project name
-project (cudaForStorm CXX C)
-
-# Set the version number
-set (STORM_CPP_VERSION_MAJOR 1)
-set (STORM_CPP_VERSION_MINOR 0)
-
-# Add base folder for better inclusion paths
-include_directories("${PROJECT_SOURCE_DIR}")
-include_directories("${PROJECT_SOURCE_DIR}/src")
-
-message(STATUS "StoRM (CudaPlugin) - CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}")
-
-#############################################################
-##
-##	CMake options of StoRM
-##
-#############################################################
-option(CUDAFORSTORM_DEBUG "Sets whether the DEBUG mode is used" ON)
-option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF)
-option(USE_LIBCXX "Sets whether the standard library is libc++." OFF)
-
-set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.")
-set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.")
-set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE STRING "The Build directory of storm, where the library files should be installed to (if available).")
-
-#############################################################
-##
-##	Inclusion of required libraries
-##
-#############################################################
-
-# Add the resources/cmake folder to Module Search Path for FindTBB.cmake
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/")
-
-# Set the hint for CUSP
-set(CUSP_HINT "${PROJECT_SOURCE_DIR}/../3rdparty/cusplibrary")
-
-find_package(CUDA REQUIRED)
-find_package(Cusp REQUIRED)
-find_package(Doxygen REQUIRED)
-find_package(Thrust REQUIRED)
-
-# If the DEBUG option was turned on, we will target a debug version and a release version otherwise
-if (CUDAFORSTORM_DEBUG)
-    set (CMAKE_BUILD_TYPE "DEBUG")
-else()
-    set (CMAKE_BUILD_TYPE "RELEASE")
-endif()
-message(STATUS "StoRM (CudaPlugin) - Building ${CMAKE_BUILD_TYPE} version.")
-
-message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
-message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}")
-
-#############################################################
-##
-##	CUDA Options
-##
-#############################################################
-SET (CUDA_VERBOSE_BUILD ON CACHE BOOL "nvcc verbose" FORCE)
-set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
-set(BUILD_SHARED_LIBS OFF)
-set(CUDA_SEPARABLE_COMPILATION ON)
-#set(CUDA_NVCC_FLAGS "-arch=sm_30")
-
-# Because the FindCUDA.cmake file has a path related bug, two folders have to be present
-file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Debug")
-file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Release")
-
-
-#############################################################
-##
-##	Compiler specific settings and definitions
-##
-#############################################################
-if(CMAKE_COMPILER_IS_GNUCC)
-    message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: GCC")
-    # Set standard flags for GCC
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -pedantic")
-elseif(MSVC)
-    message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: MSVC")
-	# required for GMM to compile, ugly error directive in their code
-	add_definitions(/D_SCL_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS)
-	# required as the PRCTL Parser bloats object files (COFF) beyond their maximum size (see http://msdn.microsoft.com/en-us/library/8578y171(v=vs.110).aspx)
-	add_definitions(/bigobj)
-	# required by GTest and PrismGrammar::createIntegerVariable
-	add_definitions(/D_VARIADIC_MAX=10)
-	# Windows.h breaks GMM in gmm_except.h because of its macro definition for min and max
-	add_definitions(/DNOMINMAX)
-else(CLANG)
-    message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: Clang (LLVM)")
-	# As CLANG is not set as a variable, we need to set it in case we have not matched another compiler.
-	set (CLANG ON)
-    # Set standard flags for clang
-    set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops -O3")
-    if(UNIX AND NOT APPLE AND NOT USE_LIBCXX)
-		set(CLANG_STDLIB libstdc++)
-		message(STATUS "StoRM (CudaPlugin) - Linking against libstdc++")
-    else()
-		set(CLANG_STDLIB libc++)
-		message(STATUS "StoRM (CudaPlugin) - Linking against libc++")
-		# Set up some Xcode specific settings
-		set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11")
-		set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++")
-    endif()
-    
-    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=${CLANG_STDLIB} -Wall -pedantic -Wno-unused-variable -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -ftemplate-depth=1024")
-    
-    set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
-endif()
-
-#############################################################
-##
-##	CMake-generated Config File for StoRM
-##
-#############################################################
-
-# Test for type alignment
-try_run(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT
-	${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp"
-	COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR
-)
-if(NOT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT)
-	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}")
-elseif(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT EQUAL 0)
-	message(STATUS "StoRM (CudaPlugin) - Result of Type Alignment Check: OK.")
-else()
-	message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})")
-endif()
-
-# Test for Float 64bit Alignment
-try_run(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT
-	${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp"
-	COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR
-)
-if(NOT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT)
-	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test float type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}")
-elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 2)
-	message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment active.")
-	set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "define")
-elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 3)
-	message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment disabled.")
-	set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "undef")
-else()
-	message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Float Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_FLOATALIGNMENT})")
-endif()
-
-
-#
-# Make a version file containing the current version from git.
-#
-include(GetGitRevisionDescription)
-git_describe_checkout(STORM_GIT_VERSION_STRING)
-# Parse the git Tag into variables
-string(REGEX REPLACE "^([0-9]+)\\..*" "\\1" STORM_CUDAPLUGIN_VERSION_MAJOR "${STORM_GIT_VERSION_STRING}")
-string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_MINOR "${STORM_GIT_VERSION_STRING}")
-string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_PATCH "${STORM_GIT_VERSION_STRING}")
-string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-([0-9]+)\\-.*" "\\1" STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD "${STORM_GIT_VERSION_STRING}")
-string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-([a-z0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_HASH "${STORM_GIT_VERSION_STRING}")
-string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-[a-z0-9]+\\-(.*)" "\\1" STORM_CUDAPLUGIN_VERSION_APPENDIX "${STORM_GIT_VERSION_STRING}")
-if ("${STORM_CUDAPLUGIN_VERSION_APPENDIX}" MATCHES "^.*dirty.*$")
-	set(STORM_CUDAPLUGIN_VERSION_DIRTY 1)
-else()
-	set(STORM_CUDAPLUGIN_VERSION_DIRTY 0)
-endif()
-message(STATUS "StoRM (CudaPlugin) - Version information: ${STORM_CUDAPLUGIN_VERSION_MAJOR}.${STORM_CUDAPLUGIN_VERSION_MINOR}.${STORM_CUDAPLUGIN_VERSION_PATCH} (${STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD} commits ahead of Tag) build from ${STORM_CUDAPLUGIN_VERSION_HASH} (Dirty: ${STORM_CUDAPLUGIN_VERSION_DIRTY})")
-
-
-# Configure a header file to pass some of the CMake settings to the source code
-configure_file (
-	"${PROJECT_SOURCE_DIR}/storm-cudaplugin-config.h.in"
-	"${PROJECT_BINARY_DIR}/include/storm-cudaplugin-config.h"
-)
-# Add the binary dir include directory for storm-config.h
-include_directories("${PROJECT_BINARY_DIR}/include")
-
-# Add the main source directory for includes
-include_directories("${PROJECT_SOURCE_DIR}/../../src")
-
-#############################################################
-##
-##	Source file aggregation and clustering
-##
-#############################################################
-file(GLOB_RECURSE CUDAFORSTORM_HEADERS ${PROJECT_SOURCE_DIR}/src/*.h)
-file(GLOB_RECURSE CUDAFORSTORM_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp)
-
-file(GLOB_RECURSE CUDAFORSTORM_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/srcCuda/*.cu")
-file(GLOB_RECURSE CUDAFORSTORM_CUDA_HEADERS "${PROJECT_SOURCE_DIR}/srcCuda/*.h")
-
-# Additional include files like the storm-config.h
-file(GLOB_RECURSE CUDAFORSTORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h)
-
-# Group the headers and sources
-source_group(main FILES ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES})
-source_group(cuda FILES ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS})
-
-# Add custom additional include or link directories
-if (ADDITIONAL_INCLUDE_DIRS)
-	message(STATUS "StoRM (CudaPlugin) - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}")
-	include_directories(${ADDITIONAL_INCLUDE_DIRS})
-endif(ADDITIONAL_INCLUDE_DIRS)
-if (ADDITIONAL_LINK_DIRS)
-	message(STATUS "StoRM (CudaPlugin) - Using additional link directories ${ADDITIONAL_LINK_DIRS}")
-	link_directories(${ADDITIONAL_LINK_DIRS})
-endif(ADDITIONAL_LINK_DIRS)
-
-#############################################################
-##
-##	Pre executable-creation link_directories setup
-##
-#############################################################
-
-
-
-#############################################################
-##
-##	CUDA
-##
-#############################################################
-#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30)
-#target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY})
-#ADD_DEPENDENCIES(cudaForStorm cudaLibrary)
-#target_link_libraries(cudaForStorm cudaLibrary)
-message(STATUS "StoRM (CudaPlugin) - Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}")
-include_directories(${CUDA_INCLUDE_DIRS})
-
-#############################################################
-##
-##	CUSP
-##
-#############################################################
-if(CUSP_FOUND)
-	include_directories(${CUSP_INCLUDE_DIR})
-	cuda_include_directories(${CUSP_INCLUDE_DIR})
-	message(STATUS "StoRM (CudaPlugin) - Found CUSP Version ${CUSP_VERSION} in location ${CUSP_INCLUDE_DIR}")
-else()
-	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find CUSP!")
-endif()
-
-#############################################################
-##
-##	Thrust
-##
-#############################################################
-if(THRUST_FOUND)
-	include_directories(${THRUST_INCLUDE_DIR})
-	cuda_include_directories(${THRUST_INCLUDE_DIR})
-	message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION} in location ${THRUST_INCLUDE_DIR}")
-else()
-	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find Thrust! Check your CUDA installation.")
-endif()
-
-###############################################################################
-##                                                                            #
-##	Executable Creation                                                       #
-##                                                                            #
-##  All link_directories() calls AND include_directories() calls              #
-##  MUST be made before this point                                            #
-##                                                                            #
-###############################################################################
-include (GenerateExportHeader)
-
-cuda_add_library(cudaForStorm SHARED
-  ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}
-  OPTIONS -DSTUFF="" -arch=sm_30
-  RELEASE -DNDEBUG
-  DEBUG -g -DDEBUG
-)
-GENERATE_EXPORT_HEADER( cudaForStorm
-             BASE_NAME cudaForStorm
-             EXPORT_MACRO_NAME cudaForStorm_EXPORT
-             EXPORT_FILE_NAME include/cudaForStorm_Export.h
-             STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
-)
-
-if (MSVC)
-	# Add the DebugHelper DLL
-	set(CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES} Dbghelp.lib")
-	target_link_libraries(cudaForStorm "Dbghelp.lib")   
-endif(MSVC)
-
-# Link against libc++abi if requested. May be needed to build on Linux systems using clang.
-if (LINK_LIBCXXABI)
-	message (STATUS "StoRM (CudaPlugin) - Linking against libc++abi.")
-	target_link_libraries(cudaForStorm "c++abi")
-endif(LINK_LIBCXXABI)
-
-# Install Directive
-install(TARGETS cudaForStorm DESTINATION "${STORM_LIB_INSTALL_DIR}/lib")
-install(FILES "${PROJECT_SOURCE_DIR}/srcCuda/cudaForStorm.h" "${PROJECT_BINARY_DIR}/include/cudaForStorm_Export.h" DESTINATION "${STORM_LIB_INSTALL_DIR}/include")
--- a/resources/cudaForStorm/srcCuda/allCudaKernels.h
+++ b/resources/cudaForStorm/srcCuda/allCudaKernels.h
@ -1,6 +0,0 @@
-#include "utility.h"
-#include "bandWidth.h"
-#include "basicAdd.h"
-#include "kernelSwitchTest.h"
-#include "basicValueIteration.h"
-#include "version.h"
--- a/resources/cudaForStorm/srcCuda/bandWidth.cu
+++ b/resources/cudaForStorm/srcCuda/bandWidth.cu
--- a/resources/cudaForStorm/srcCuda/bandWidth.h
+++ b/resources/cudaForStorm/srcCuda/bandWidth.h
--- a/resources/cudaForStorm/srcCuda/basicAdd.cu
+++ b/resources/cudaForStorm/srcCuda/basicAdd.cu
@ -1,286 +0,0 @@
-#include <cuda.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-#include <chrono>
-#include <iostream>
-
-__global__ void cuda_kernel_basicAdd(int a, int b, int *c) { 
-	*c = a + b; 
-}
-
-__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
-	// Fused Multiply Add:
-	// A * B + C => D
-
-	/*
-     *Die Variable i dient für den Zugriff auf das Array. Da jeder Thread die Funktion VecAdd
-     *ausführt, muss i für jeden Thread unterschiedlich sein. Ansonsten würden unterschiedliche
-     *Threads auf denselben Index im Array schreiben. blockDim.x ist die Anzahl der Threads der x-Komponente
-     *des Blocks, blockIdx.x ist die x-Koordinate des aktuellen Blocks und threadIdx.x ist die x-Koordinate des
-     *Threads, der die Funktion gerade ausführt.
-    */
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-	if (i < N) {
-		D[i] = A[i] * B[i] + C[i];
-	}
-}
-
-__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) {
-	// Fused Multiply Add:
-	// A * B + C => D
-
-	// Layout:
-	// A B C D A B C D A B C D
-
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-	if ((i*M) < N) {
-		for (int j = i*M; j < i*M + M; ++j) {
-			A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2];
-		}
-	}
-}
-
-extern "C" int cuda_basicAdd(int a, int b) {
-	int c = 0;
-	int *dev_c;
-	cudaMalloc((void**)&dev_c, sizeof(int));
-	cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c);
-	cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
-	//printf("%d + %d + 42 is %d\n", a, b, c);
-	cudaFree(dev_c);
-	return c;
-}
-
-void cpp_cuda_bandwidthTest(int entryCount, int N) {
-	// Size of the Arrays
-	size_t arraySize = entryCount * sizeof(int);
-	
-	int* deviceIntArray;
-	int* hostIntArray = new int[arraySize];
-
-	// Allocate space on the device
-	auto start_time = std::chrono::high_resolution_clock::now();
-	for (int i = 0; i < N; ++i) {
-		if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
-			std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl;
-			delete[] hostIntArray;
-			return;
-		}
-		// Free memory on device
-		if (cudaFree(deviceIntArray) != cudaSuccess) {
-			std::cout << "Error in cudaFree!" << std::endl;
-			delete[] hostIntArray;
-			return;
-		}
-	}
-	auto end_time = std::chrono::high_resolution_clock::now();
-	auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
-	double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
-	std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
-	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second Allocationspeed." << std::endl;
-
-	if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
-		std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl;
-		delete[] hostIntArray;
-		return;
-	}
-	
-	// Prepare data
-	for (int i = 0; i < N; ++i) {
-		hostIntArray[i] = i * 333 + 123;
-	}
-
-	// Copy data TO device
-	start_time = std::chrono::high_resolution_clock::now();
-	for (int i = 0; i < N; ++i) {
-		if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
-			std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl;
-			// Free memory on device
-			if (cudaFree(deviceIntArray) != cudaSuccess) {
-				std::cout << "Error in cudaFree!" << std::endl;
-			}
-			delete[] hostIntArray;
-			return;
-		}
-	}
-	end_time = std::chrono::high_resolution_clock::now();
-	copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
-	mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
-	std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
-	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl;
-
-	// Copy data FROM device
-	start_time = std::chrono::high_resolution_clock::now();
-	for (int i = 0; i < N; ++i) {
-		if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
-			std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl;
-			// Free memory on device
-			if (cudaFree(deviceIntArray) != cudaSuccess) {
-				std::cout << "Error in cudaFree!" << std::endl;
-			}
-			delete[] hostIntArray;
-			return;
-		}
-	}
-	end_time = std::chrono::high_resolution_clock::now();
-	copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
-	mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
-	std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
-	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl;
-
-	// Free memory on device
-	if (cudaFree(deviceIntArray) != cudaSuccess) {
-		std::cout << "Error in cudaFree!" << std::endl;
-	}
-	delete[] hostIntArray;
-}
-
-extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
-	// Size of the Arrays
-	size_t arraySize = N * sizeof(int);
-	
-	int* deviceIntArrayA;
-	int* deviceIntArrayB;
-	int* deviceIntArrayC;
-	int* deviceIntArrayD;
-
-	// Allocate space on the device
-	if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
-		printf("Error in cudaMalloc1!\n");
-		return;
-	}
-	if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) {
-		printf("Error in cudaMalloc2!\n");
-		cudaFree(deviceIntArrayA);
-		return;
-	}
-	if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) {
-		printf("Error in cudaMalloc3!\n");
-		cudaFree(deviceIntArrayA);
-		cudaFree(deviceIntArrayB);
-		return;
-	}
-	if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) {
-		printf("Error in cudaMalloc4!\n");
-		cudaFree(deviceIntArrayA);
-		cudaFree(deviceIntArrayB);
-		cudaFree(deviceIntArrayC);
-		return;
-	}
-	
-	// Copy data TO device
-	if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
-		printf("Error in cudaMemcpy!\n");
-		cudaFree(deviceIntArrayA);
-		cudaFree(deviceIntArrayB);
-		cudaFree(deviceIntArrayC);
-		cudaFree(deviceIntArrayD);
-		return;
-	}
-	if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
-		printf("Error in cudaMemcpy!\n");
-		cudaFree(deviceIntArrayA);
-		cudaFree(deviceIntArrayB);
-		cudaFree(deviceIntArrayC);
-		cudaFree(deviceIntArrayD);
-		return;
-	}
-	if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
-		printf("Error in cudaMemcpy!\n");
-		cudaFree(deviceIntArrayA);
-		cudaFree(deviceIntArrayB);
-		cudaFree(deviceIntArrayC);
-		cudaFree(deviceIntArrayD);
-		return;
-	}
-	
-    // Festlegung der Threads pro Block
-    int threadsPerBlock = 512;
-    // Es werden soviele Blöcke benötigt, dass alle Elemente der Vektoren abgearbeitet werden können
-    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
-
-	// Run kernel
-	cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N);
-
-	// Copy data FROM device
-	if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
-		printf("Error in cudaMemcpy!\n");
-		cudaFree(deviceIntArrayA);
-		cudaFree(deviceIntArrayB);
-		cudaFree(deviceIntArrayC);
-		cudaFree(deviceIntArrayD);
-		return;
-	}
-
-	// Free memory on device
-	cudaFree(deviceIntArrayA);
-	cudaFree(deviceIntArrayB);
-	cudaFree(deviceIntArrayC);
-	cudaFree(deviceIntArrayD);
-}
-
-extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) {
-	// Size of the Arrays
-	size_t arraySize = N * sizeof(int) * 4;
-	
-	int* deviceIntArrayA;
-
-	// Allocate space on the device
-	if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
-		printf("Error in cudaMalloc1!\n");
-		return;
-	}
-
-#define ONFAILFREE0() do { } while(0)
-#define ONFAILFREE1(a) do { cudaFree(a); } while(0)
-#define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0)
-#define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0)
-#define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0)
-#define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { int retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in func__!\n"); return; } } while(0)
-
-	// Copy data TO device
-
-	CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice);
-
-	/*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
-		printf("Error in cudaMemcpy!\n");
-		cudaFree(deviceIntArrayA);
-		return;
-	}*/
-	
-    // Festlegung der Threads pro Block
-    int threadsPerBlock = 512;
-    // Es werden soviele Blöcke benötigt, dass alle Elemente der Vektoren abgearbeitet werden können
-    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
-
-	// Run kernel
-	cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M);
-
-	// Copy data FROM device
-	if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
-		printf("Error in cudaMemcpy!\n");
-		cudaFree(deviceIntArrayA);
-		return;
-	}
-
-	// Free memory on device
-	if (cudaFree(deviceIntArrayA) != cudaSuccess) {
-		printf("Error in cudaFree!\n");
-		return;
-	}
-}
-
-extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
-	for (int i = 0; i < N; ++i) {
-		D[i] = A[i] * B[i] + C[i];
-	}
-}
-
-extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) {
-	for (int i = 0; i < N; i += 4) {
-		A[i+3] = A[i] * A[i+1] + A[i+2];
-	}
-}
--- a/resources/cudaForStorm/srcCuda/basicAdd.h
+++ b/resources/cudaForStorm/srcCuda/basicAdd.h
@ -1,9 +0,0 @@
-extern "C" int cuda_basicAdd(int a, int b);
-
-extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M);
-extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N);
-
-extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N);
-extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N);
-
-void cpp_cuda_bandwidthTest(int entryCount, int N);
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
@ -1,879 +0,0 @@
-#include "basicValueIteration.h"
-#define CUSP_USE_TEXTURE_MEMORY
-
-#include <iostream>
-#include <chrono>
-
-#include <cuda_runtime.h>
-#include "cusparse_v2.h"
-
-#include "utility.h"
-
-#include "cuspExtension.h"
-
-#include <thrust/transform.h>
-#include <thrust/device_ptr.h>
-#include <thrust/functional.h>
-
-#include "storm-cudaplugin-config.h"
-
-#ifdef DEBUG
-#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync  = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize();	if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false)
-#else
-#define CUDA_CHECK_ALL_ERRORS() do {} while (false)
-#endif
-
-template<typename T, bool Relative>
-struct equalModuloPrecision : public thrust::binary_function<T,T,T>
-{
-__host__ __device__ T operator()(const T &x, const T &y) const
-{
-    if (Relative) {
-		if (y == 0) {
-			return ((x >= 0) ? (x) : (-x));
-		}
-		const T result = (x - y) / y;
-		return ((result >= 0) ? (result) : (-result));
-    } else {
-        const T result = (x - y);
-		return ((result >= 0) ? (result) : (-result));
-    }
-}
-};
-
-template<typename IndexType, typename ValueType>
-void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) {
-	indexVector.reserve(inputVector.size());
-	valueVector.reserve(inputVector.size());
-	for (size_t i = 0; i < inputVector.size(); ++i) {
-		indexVector.push_back(inputVector.at(i).first);
-		valueVector.push_back(inputVector.at(i).second);
-	}
-}
-
-// TEMPLATE VERSION
-template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
-bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) {
-	//std::vector<IndexType> matrixColumnIndices;
-	//std::vector<ValueType> matrixValues;
-	//exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
-	bool errorOccured = false;
-
-	IndexType* device_matrixRowIndices = nullptr;
-	ValueType* device_matrixColIndicesAndValues = nullptr;
-	ValueType* device_x = nullptr;
-	ValueType* device_xSwap = nullptr;
-	ValueType* device_b = nullptr;
-	ValueType* device_multiplyResult = nullptr;
-	IndexType* device_nondeterministicChoiceIndices = nullptr;
-
-#ifdef DEBUG
-	std::cout.sync_with_stdio(true);
-	std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
-	std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
-	size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
-	std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
-#endif
-
-	const IndexType matrixRowCount = matrixRowIndices.size() - 1;
-	const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1;
-	const IndexType matrixNnzCount = columnIndicesAndValues.size();
-
-	cudaError_t cudaMallocResult;
-
-	bool converged = false;
-	iterationCount = 0;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
-#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true
-#else
-#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false
-#endif
-	if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
-		CUDA_CHECK_ALL_ERRORS();
-		cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
-		if (cudaMallocResult != cudaSuccess) {
-			std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
-			errorOccured = true;
-			goto cleanup;
-		}
-	} else {
-		CUDA_CHECK_ALL_ERRORS();
-		cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
-		if (cudaMallocResult != cudaSuccess) {
-			std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
-			errorOccured = true;
-			goto cleanup;
-		}
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_xSwap), sizeof(ValueType) * matrixColCount);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * matrixRowCount);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1));
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-#ifdef DEBUG
-	std::cout << "(DLL) Finished allocating memory." << std::endl;
-#endif
-
-	// Memory allocated, copy data to device
-	cudaError_t cudaCopyResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-	// Copy all data as floats are expanded to 64bits :/
-	if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
-		CUDA_CHECK_ALL_ERRORS();
-		cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
-		if (cudaCopyResult != cudaSuccess) {
-			std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
-			errorOccured = true;
-			goto cleanup;
-		}
-	} else {
-		CUDA_CHECK_ALL_ERRORS();
-		cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
-		if (cudaCopyResult != cudaSuccess) {
-			std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
-			errorOccured = true;
-			goto cleanup;
-		}
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-	// Preset the xSwap to zeros...
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-	// Preset the multiplyResult to zeros...
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-#ifdef DEBUG
-	std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
-#endif
-
-	// Data is on device, start Kernel
-	while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization
-		cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
-		CUDA_CHECK_ALL_ERRORS();
-
-		thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
-		thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult);
-
-		// Transform: Add multiplyResult + b inplace to multiplyResult
-		thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>());
-		CUDA_CHECK_ALL_ERRORS();
-
-		// Reduce: Reduce multiplyResult to a new x vector
-		cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
-		CUDA_CHECK_ALL_ERRORS();
-
-		// Check for convergence
-		// Transform: x = abs(x - xSwap)/ xSwap
-		thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
-		thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount);
-		thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap);
-		thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
-		CUDA_CHECK_ALL_ERRORS();
-
-		// Reduce: get Max over x and check for res < Precision
-		ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
-		CUDA_CHECK_ALL_ERRORS();
-		converged = (maxX < precision);
-		++iterationCount;
-
-		// Swap pointers, device_x always contains the most current result
-		std::swap(device_x, device_xSwap);
-	}
-
-	if (!converged && (iterationCount == maxIterationCount)) {
-		iterationCount = 0;
-		errorOccured = true;
-	}
-
-#ifdef DEBUG
-	std::cout << "(DLL) Finished kernel execution." << std::endl;
-	std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
-#endif
-
-	// Get x back from the device
-	cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl;
-		errorOccured = true;
-		goto cleanup;
-	}
-
-#ifdef DEBUG
-	std::cout << "(DLL) Finished copying result data." << std::endl;
-#endif
-
-	// All code related to freeing memory and clearing up the device
-cleanup:
-	if (device_matrixRowIndices != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl;
-			errorOccured = true;
-		}
-		device_matrixRowIndices = nullptr;
-	}
-	if (device_matrixColIndicesAndValues != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
-			errorOccured = true;
-		}
-		device_matrixColIndicesAndValues = nullptr;
-	}
-	if (device_x != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_x);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
-			errorOccured = true;
-		}
-		device_x = nullptr;
-	}
-	if (device_xSwap != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_xSwap);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl;
-			errorOccured = true;
-		}
-		device_xSwap = nullptr;
-	}
-	if (device_b != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_b);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl;
-			errorOccured = true;
-		}
-		device_b = nullptr;
-	}
-	if (device_multiplyResult != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_multiplyResult);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl;
-			errorOccured = true;
-		}
-		device_multiplyResult = nullptr;
-	}
-	if (device_nondeterministicChoiceIndices != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." << std::endl;
-			errorOccured = true;
-		}
-		device_nondeterministicChoiceIndices = nullptr;
-	}
-#ifdef DEBUG
-	std::cout << "(DLL) Finished cleanup." << std::endl;
-#endif
-
-	return !errorOccured;
-}
-
-template <typename IndexType, typename ValueType>
-void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) {
-	IndexType* device_matrixRowIndices = nullptr;
-	ValueType* device_matrixColIndicesAndValues = nullptr;
-	ValueType* device_x = nullptr;
-	ValueType* device_multiplyResult = nullptr;
-
-#ifdef DEBUG
-	std::cout.sync_with_stdio(true);
-	std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl;
-	std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl; 
-	size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size();
-	std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
-#endif
-
-	const IndexType matrixRowCount = matrixRowIndices.size() - 1;
-	const IndexType matrixNnzCount = columnIndicesAndValues.size();
-
-	cudaError_t cudaMallocResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-#else
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-#endif
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-#ifdef DEBUG
-	std::cout << "(DLL) Finished allocating memory." << std::endl;
-#endif
-
-	// Memory allocated, copy data to device
-	cudaError_t cudaCopyResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-#else
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-#endif
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	// Preset the multiplyResult to zeros...
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-#ifdef DEBUG
-	std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
-#endif
-
-	cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
-	CUDA_CHECK_ALL_ERRORS();
-
-#ifdef DEBUG
-	std::cout << "(DLL) Finished kernel execution." << std::endl;
-#endif
-
-	// Get result back from the device
-	cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-#ifdef DEBUG
-	std::cout << "(DLL) Finished copying result data." << std::endl;
-#endif
-
-	// All code related to freeing memory and clearing up the device
-cleanup:
-	if (device_matrixRowIndices != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_matrixRowIndices = nullptr;
-	}
-	if (device_matrixColIndicesAndValues != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_matrixColIndicesAndValues = nullptr;
-	}
-	if (device_x != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_x);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_x = nullptr;
-	}
-	if (device_multiplyResult != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_multiplyResult);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_multiplyResult = nullptr;
-	}
-#ifdef DEBUG
-	std::cout << "(DLL) Finished cleanup." << std::endl;
-#endif
-}
-
-template <typename ValueType>
-void basicValueIteration_addVectorsInplace(std::vector<ValueType>& a, std::vector<ValueType> const& b) {
-	ValueType* device_a = nullptr;
-	ValueType* device_b = nullptr;
-
-	const size_t vectorSize = std::max(a.size(), b.size());
-
-	cudaError_t cudaMallocResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_a), sizeof(ValueType) * vectorSize);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * vectorSize);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	// Memory allocated, copy data to device
-	cudaError_t cudaCopyResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-	
-	do {
-		// Transform: Add multiplyResult + b inplace to multiplyResult
-		thrust::device_ptr<ValueType> devicePtrThrust_a(device_a);
-		thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
-		thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus<ValueType>());
-		CUDA_CHECK_ALL_ERRORS();
-	} while (false);
-
-	// Get result back from the device
-	cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	// All code related to freeing memory and clearing up the device
-cleanup:
-	if (device_a != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_a);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_a = nullptr;
-	}
-	if (device_b != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_b);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_b = nullptr;
-	}
-}
-
-template <typename IndexType, typename ValueType, bool Minimize>
-void basicValueIteration_reduceGroupedVector(std::vector<ValueType> const& groupedVector, std::vector<IndexType> const& grouping, std::vector<ValueType>& targetVector) {
-	ValueType* device_groupedVector = nullptr;
-	IndexType* device_grouping = nullptr;
-	ValueType* device_target = nullptr;
-
-	const size_t groupedSize = groupedVector.size();
-	const size_t groupingSize = grouping.size();
-	const size_t targetSize = targetVector.size();
-
-	cudaError_t cudaMallocResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_groupedVector), sizeof(ValueType) * groupedSize);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_grouping), sizeof(IndexType) * groupingSize);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_target), sizeof(ValueType) * targetSize);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	// Memory allocated, copy data to device
-	cudaError_t cudaCopyResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-	
-	do {
-		// Reduce: Reduce multiplyResult to a new x vector
-		cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector);
-		CUDA_CHECK_ALL_ERRORS();
-	} while (false);
-
-	// Get result back from the device
-	cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	// All code related to freeing memory and clearing up the device
-cleanup:
-	if (device_groupedVector != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_groupedVector);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_groupedVector = nullptr;
-	}
-	if (device_grouping != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_grouping);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_grouping = nullptr;
-	}
-	if (device_target != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_target);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_target = nullptr;
-	}
-}
-
-template <typename ValueType, bool Relative>
-void basicValueIteration_equalModuloPrecision(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType& maxElement) {
-	ValueType* device_x = nullptr;
-	ValueType* device_y = nullptr;
-
-	const size_t vectorSize = x.size();
-
-	cudaError_t cudaMallocResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * vectorSize);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_y), sizeof(ValueType) * vectorSize);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	// Memory allocated, copy data to device
-	cudaError_t cudaCopyResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-	
-	do {
-		// Transform: x = abs(x - xSwap)/ xSwap
-		thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
-		thrust::device_ptr<ValueType> devicePtrThrust_y(device_y);
-		thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
-		CUDA_CHECK_ALL_ERRORS();
-
-		// Reduce: get Max over x and check for res < Precision
-		maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
-		CUDA_CHECK_ALL_ERRORS();
-	} while (false);
-
-	// All code related to freeing memory and clearing up the device
-cleanup:
-	if (device_x != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_x);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_x = nullptr;
-	}
-	if (device_y != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_y);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_y = nullptr;
-	}
-}
-
-/*
- * Declare and implement all exported functions for these Kernels here
- *
- */
-
-void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) {
-	basicValueIteration_spmv<uint_fast64_t, double>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
-}
-
-void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) {
-	basicValueIteration_addVectorsInplace<double>(a, b);
-}
-
-void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
-	basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(groupedVector, grouping, targetVector);
-}
-
-void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
-	basicValueIteration_reduceGroupedVector<uint_fast64_t, double, false>(groupedVector, grouping, targetVector);
-}
-
-void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
-	basicValueIteration_equalModuloPrecision<double, true>(x, y, maxElement);
-}
-
-void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
-	basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement);
-}
-
-// Float
-void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b) {
-	basicValueIteration_spmv<uint_fast64_t, float>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
-}
-
-void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b) {
-	basicValueIteration_addVectorsInplace<float>(a, b);
-}
-
-void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
-	basicValueIteration_reduceGroupedVector<uint_fast64_t, float, true>(groupedVector, grouping, targetVector);
-}
-
-void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
-	basicValueIteration_reduceGroupedVector<uint_fast64_t, float, false>(groupedVector, grouping, targetVector);
-}
-
-void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
-	basicValueIteration_equalModuloPrecision<float, true>(x, y, maxElement);
-}
-
-void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
-	basicValueIteration_equalModuloPrecision<float, false>(x, y, maxElement);
-}
-
-bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
-	if (relativePrecisionCheck) {
-		return basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	} else {
-		return basicValueIteration_mvReduce<true, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	}
-}
-
-bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
-	if (relativePrecisionCheck) {
-		return basicValueIteration_mvReduce<false, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	} else {
-		return basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	}
-}
-
-bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
-	if (relativePrecisionCheck) {
-		return basicValueIteration_mvReduce<true, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	} else {
-		return basicValueIteration_mvReduce<true, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	}
-}
-
-bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
-	if (relativePrecisionCheck) {
-		return basicValueIteration_mvReduce<false, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	} else {
-		return basicValueIteration_mvReduce<false, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	}
-}
-
-size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
-	size_t const valueTypeSize = sizeof(double);
-	size_t const indexTypeSize = sizeof(uint_fast64_t);
-
-	/*
-	IndexType* device_matrixRowIndices = nullptr;
-	IndexType* device_matrixColIndices = nullptr;
-	ValueType* device_matrixValues = nullptr;
-	ValueType* device_x = nullptr;
-	ValueType* device_xSwap = nullptr;
-	ValueType* device_b = nullptr;
-	ValueType* device_multiplyResult = nullptr;
-	IndexType* device_nondeterministicChoiceIndices = nullptr;
-	*/
-
-	// Row Indices, Column Indices, Values, Choice Indices
-	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
-	// Vectors x, xSwap, b, multiplyResult
-	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
-
-	return (matrixDataSize + vectorSizes);
-}
-
-size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
-	size_t const valueTypeSize = sizeof(float);
-	size_t const indexTypeSize = sizeof(uint_fast64_t);
-
-	/*
-	IndexType* device_matrixRowIndices = nullptr;
-	IndexType* device_matrixColIndices = nullptr;
-	ValueType* device_matrixValues = nullptr;
-	ValueType* device_x = nullptr;
-	ValueType* device_xSwap = nullptr;
-	ValueType* device_b = nullptr;
-	ValueType* device_multiplyResult = nullptr;
-	IndexType* device_nondeterministicChoiceIndices = nullptr;
-	*/
-
-	// Row Indices, Column Indices, Values, Choice Indices
-	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
-	// Vectors x, xSwap, b, multiplyResult
-	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
-
-	return (matrixDataSize + vectorSizes);
-}
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.h
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h
@ -1,107 +0,0 @@
-#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
-#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
-
-#include <cstdint>
-#include <vector>
-#include <utility>
-
-// Library exports
-#include "cudaForStorm_Export.h"
-
-/* Helper declaration to cope with new internal format */
-#ifndef STORM_STORAGE_SPARSEMATRIX_H_
-namespace storm {
-	namespace storage {
-template<typename T>
-    class MatrixEntry {
-    public:
-        /*!
-            * Constructs a matrix entry with the given column and value.
-            *
-            * @param column The column of the matrix entry.
-            * @param value The value of the matrix entry.
-            */
-        MatrixEntry(uint_fast64_t column, T value);
-            
-        /*!
-            * Move-constructs the matrix entry fro the given column-value pair.
-            *
-            * @param pair The column-value pair from which to move-construct the matrix entry.
-            */
-        MatrixEntry(std::pair<uint_fast64_t, T>&& pair);
-            
-        //MatrixEntry() = default;
-        //MatrixEntry(MatrixEntry const& other) = default;
-        //MatrixEntry& operator=(MatrixEntry const& other) = default;
-#ifndef WINDOWS
-        //MatrixEntry(MatrixEntry&& other) = default;
-        //MatrixEntry& operator=(MatrixEntry&& other) = default;
-#endif
-            
-        /*!
-            * Retrieves the column of the matrix entry.
-            *
-            * @return The column of the matrix entry.
-            */
-        uint_fast64_t const& getColumn() const;
-            
-        /*!
-            * Retrieves the column of the matrix entry.
-            *
-            * @return The column of the matrix entry.
-            */
-        uint_fast64_t& getColumn();
-            
-        /*!
-            * Retrieves the value of the matrix entry.
-            *
-            * @return The value of the matrix entry.
-            */
-        T const& getValue() const;
-
-        /*!
-            * Retrieves the value of the matrix entry.
-            *
-            * @return The value of the matrix entry.
-            */
-        T& getValue();
-            
-        /*!
-            * Retrieves a pair of column and value that characterizes this entry.
-            *
-            * @return A column-value pair that characterizes this entry.
-            */
-        std::pair<uint_fast64_t, T> const& getColumnValuePair() const;
-            
-    private:
-        // The actual matrix entry.
-        std::pair<uint_fast64_t, T> entry;
-    };
-
-	}
-}
-#endif
-
-cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
-cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
-cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
-
-cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
-cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
-cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
-
-cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b);
-cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b);
-cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector);
-cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector);
-cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement);
-cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement);
-
-cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b);
-cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b);
-cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector);
-cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector);
-cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement);
-cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement);
-
-#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
--- a/resources/cudaForStorm/srcCuda/cudaForStorm.h
+++ b/resources/cudaForStorm/srcCuda/cudaForStorm.h
@ -1,19 +0,0 @@
-#ifndef STORM_CUDAFORSTORM_CUDAFORSTORM_H_
-#define STORM_CUDAFORSTORM_CUDAFORSTORM_H_
-
-/*
- * List of exported functions in this library
- */
-
-// TopologicalValueIteration
-#include "srcCuda/basicValueIteration.h"
-
-// Utility Functions
-#include "srcCuda/utility.h"
-
-// Version Information
-#include "srcCuda/version.h"
-
-
-
-#endif // STORM_CUDAFORSTORM_CUDAFORSTORM_H_
--- a/resources/cudaForStorm/srcCuda/cuspExtension.h
+++ b/resources/cudaForStorm/srcCuda/cuspExtension.h
@ -1,49 +0,0 @@
-#pragma once
-
-#include "cuspExtensionFloat.h"
-#include "cuspExtensionDouble.h"
-
-namespace cusp {
-namespace detail {
-namespace device {
-
-template <typename ValueType>
-void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) {
-	//
-	throw;
-}
-template <>
-void storm_cuda_opt_spmv_csr_vector<double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) {
-	storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
-}
-template <>
-void storm_cuda_opt_spmv_csr_vector<float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) {
-	storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
-}
-
-template <bool Minimize, typename ValueType>
-void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) {
-	//
-	throw;
-}
-template <>
-void storm_cuda_opt_vector_reduce<true, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
-	storm_cuda_opt_vector_reduce_double<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
-}
-template <>
-void storm_cuda_opt_vector_reduce<false, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
-	storm_cuda_opt_vector_reduce_double<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
-}
-
-template <>
-void storm_cuda_opt_vector_reduce<true, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
-	storm_cuda_opt_vector_reduce_float<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
-}
-template <>
-void storm_cuda_opt_vector_reduce<false, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
-	storm_cuda_opt_vector_reduce_float<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
-}
-
-} // end namespace device
-} // end namespace detail
-} // end namespace cusp
--- a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
+++ b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
@ -1,361 +0,0 @@
-/*
- * This is an extension of the original CUSP csr_vector.h SPMV implementation.
- * It is based on the Code and incorporates changes as to cope with the details
- * of the StoRM code.
- * Changes have been made for 1) different input format, 2) the sum calculation and 3) the group-reduce algorithm
- */
-
-/*
- *  Copyright 2008-2009 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-#pragma once
-
-#include <limits>
-#include <cstdint>
-#include <algorithm>
-
-#include <math_functions.h>
-
-#include <cusp/detail/device/spmv/csr_vector.h>
-
-namespace cusp
-{
-namespace detail
-{
-namespace device
-{
-
-//////////////////////////////////////////////////////////////////////////////
-// CSR SpMV kernels based on a vector model (one warp per row)
-//////////////////////////////////////////////////////////////////////////////
-//
-// spmv_csr_vector_device
-//   Each row of the CSR matrix is assigned to a warp.  The warp computes
-//   y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with 
-//   the x vector, in parallel.  This division of work implies that 
-//   the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
-//   manner (but generally not aligned).  On GT200 these accesses are
-//   coalesced, unlike kernels based on the one-row-per-thread division of 
-//   work.  Since an entire 32-thread warp is assigned to each row, many 
-//   threads will remain idle when their row contains a small number 
-//   of elements.  This code relies on implicit synchronization among 
-//   threads in a warp.
-//
-// spmv_csr_vector_tex_device
-//   Same as spmv_csr_vector_tex_device, except that the texture cache is 
-//   used for accessing the x vector.
-//  
-//  Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32]
-
-
-template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
-__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
-__global__ void
-storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y)
-{
-    __shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
-    __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2];
-    
-    const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
-
-    const uint_fast64_t thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;    // global thread index
-    const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);          // thread index within the vector
-    const uint_fast64_t vector_id   = thread_id   /  THREADS_PER_VECTOR;               // global vector index
-    const uint_fast64_t vector_lane = threadIdx.x /  THREADS_PER_VECTOR;               // vector index within the block
-    const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x;                   // total number of active vectors
-
-    for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
-    {
-        // use two threads to fetch Ap[row] and Ap[row+1]
-        // this is considerably faster than the straightforward version
-        if(thread_lane < 2)
-            ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane];
-
-        const uint_fast64_t row_start = ptrs[vector_lane][0];                   //same as: row_start = Ap[row];
-        const uint_fast64_t row_end   = ptrs[vector_lane][1];                   //same as: row_end   = Ap[row+1];
-
-        // initialize local sum
-        double sum = 0;
-     
-        if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
-        {
-            // ensure aligned memory access to Aj and Ax
-
-            uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
-
-            // accumulate local sums
-            if(jj >= row_start && jj < row_end) {
-				sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x);
-                //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
-			}
-				
-            // accumulate local sums
-            for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) {
-                //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
-				sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x);
-			}
-        } else {
-            // accumulate local sums
-            for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) {
-                //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
-				sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x);
-			}
-        }
-
-        // store local sum in shared memory
-        sdata[threadIdx.x] = sum;
-        
-        // reduce local sums to row sum
-        if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
-        if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
-        if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
-        if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
-        if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];
-       
-        // first thread writes the result
-        if (thread_lane == 0)
-            y[row] = sdata[threadIdx.x];
-    }
-}
-
-template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
-__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1)
-__global__ void
-storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer)
-{
-    __shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2];  // padded to avoid reduction conditionals
-    __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2];
-    
-    const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW;
-
-    const uint_fast64_t thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;    // global thread index
-    const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1);          // thread index within the vector
-    const uint_fast64_t vector_id   = thread_id   /  THREADS_PER_ROW;               // global vector index
-    const uint_fast64_t vector_lane = threadIdx.x /  THREADS_PER_ROW;               // vector index within the block
-    const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x;                   // total number of active vectors
-
-    for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
-    {
-        // use two threads to fetch Ap[row] and Ap[row+1]
-        // this is considerably faster than the straightforward version
-        if(thread_lane < 2)
-            ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane];
-
-        const uint_fast64_t row_start = ptrs[vector_lane][0];                   //same as: row_start = Ap[row];
-        const uint_fast64_t row_end   = ptrs[vector_lane][1];                   //same as: row_end   = Ap[row+1];
-
-        // initialize local Min/Max
-        double localMinMaxElement = minMaxInitializer;
-
-        if (THREADS_PER_ROW == 32 && row_end - row_start > 32)
-        {
-            // ensure aligned memory access to Aj and Ax
-
-            uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane;
-
-            // accumulate local sums
-            if(jj >= row_start && jj < row_end) {
-				if(Minimize) {
-					localMinMaxElement = min(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
-				} else {
-					localMinMaxElement = max(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
-				}
-			}
-
-            // accumulate local sums
-            for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW)
-                if(Minimize) {
-					localMinMaxElement = min(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
-				} else {
-					localMinMaxElement = max(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
-				}
-        }
-        else
-        {
-            // accumulate local sums
-            for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW)
-                if(Minimize) {
-					localMinMaxElement = min(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
-				} else {
-					localMinMaxElement = max(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
-				}
-        }
-
-        // store local sum in shared memory
-        sdata[threadIdx.x] = localMinMaxElement;
-        
-        // reduce local min/max to row min/max
-		if (Minimize) {
-			/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
-			if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
-			if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
-			if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
-			if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/
-
-			if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]);
-			if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  8]);
-			if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  4]);
-			if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  2]);
-			if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  1]);
-		} else {
-			/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
-			if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
-			if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
-			if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
-			if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/
-			if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]);
-			if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]);
-			if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]);
-			if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]);
-			if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]);
-		}
-       
-        // first thread writes the result
-        if (thread_lane == 0)
-            x[row] = sdata[threadIdx.x];
-    }
-}
-
-template <bool Minimize, unsigned int THREADS_PER_VECTOR>
-void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y)
-{
-	double __minMaxInitializer = -std::numeric_limits<double>::max();
-	if (Minimize) {
-		__minMaxInitializer = std::numeric_limits<double>::max();
-	}
-	const double minMaxInitializer = __minMaxInitializer;
-
-    const size_t THREADS_PER_BLOCK  = 128;
-    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
-
-    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0);
-    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
-
-    storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> 
-        (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer);
-}
-
-template <bool Minimize>
-void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y)
-{
-    const uint_fast64_t rows_per_group = num_entries / num_rows;
-
-    if (rows_per_group <=  2) { __storm_cuda_opt_vector_reduce_double<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; }
-    if (rows_per_group <=  4) { __storm_cuda_opt_vector_reduce_double<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; }
-    if (rows_per_group <=  8) { __storm_cuda_opt_vector_reduce_double<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; }
-    if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; }
-    
-    __storm_cuda_opt_vector_reduce_double<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y);
-}
-
-template <bool UseCache, unsigned int THREADS_PER_VECTOR>
-void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y)
-{
-    const size_t THREADS_PER_BLOCK  = 128;
-    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
-
-    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
-    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
-    
-    if (UseCache)
-        bind_x(x);
-
-    storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> 
-        (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
-
-    if (UseCache)
-        unbind_x(x);
-}
-
-void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y)
-{
-    const uint_fast64_t nnz_per_row = num_entries / num_rows;
-
-    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    
-    __storm_cuda_opt_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
-}
-
-void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y)
-{
-    const uint_fast64_t nnz_per_row = num_entries / num_rows;
-
-    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-
-    __storm_cuda_opt_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
-}
-
-// NON-OPT
-
-template <bool UseCache, unsigned int THREADS_PER_VECTOR>
-void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y)
-{
-    const size_t THREADS_PER_BLOCK  = 128;
-    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
-
-    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
-    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
-    
-    if (UseCache)
-        bind_x(x);
-
-    spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> 
-        (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
-
-    if (UseCache)
-        unbind_x(x);
-}
-
-void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y)
-{
-    const uint_fast64_t nnz_per_row = num_entries / num_rows;
-
-    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    
-    __storm_cuda_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
-}
-
-void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y)
-{
-    const uint_fast64_t nnz_per_row = num_entries / num_rows;
-
-    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-
-    __storm_cuda_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
-}
-
-} // end namespace device
-} // end namespace detail
-} // end namespace cusp
--- a/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h
+++ b/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h
@ -1,375 +0,0 @@
-/*
- * This is an extension of the original CUSP csr_vector.h SPMV implementation.
- * It is based on the Code and incorporates changes as to cope with the details
- * of the StoRM code.
- * As this is mostly copy & paste, the original license still applies.
- */
-
-/*
- *  Copyright 2008-2009 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-#pragma once
-
-#include <limits>
-#include <cstdint>
-#include <algorithm>
-
-#include <math_functions.h>
-
-#include <cusp/detail/device/spmv/csr_vector.h>
-
-#include "storm-cudaplugin-config.h"
-
-namespace cusp
-{
-namespace detail
-{
-namespace device
-{
-
-//////////////////////////////////////////////////////////////////////////////
-// CSR SpMV kernels based on a vector model (one warp per row)
-//////////////////////////////////////////////////////////////////////////////
-//
-// spmv_csr_vector_device
-//   Each row of the CSR matrix is assigned to a warp.  The warp computes
-//   y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with 
-//   the x vector, in parallel.  This division of work implies that 
-//   the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
-//   manner (but generally not aligned).  On GT200 these accesses are
-//   coalesced, unlike kernels based on the one-row-per-thread division of 
-//   work.  Since an entire 32-thread warp is assigned to each row, many 
-//   threads will remain idle when their row contains a small number 
-//   of elements.  This code relies on implicit synchronization among 
-//   threads in a warp.
-//
-// spmv_csr_vector_tex_device
-//   Same as spmv_csr_vector_tex_device, except that the texture cache is 
-//   used for accessing the x vector.
-//  
-//  Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32]
-
-
-template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
-__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
-__global__ void
-storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y)
-{
-    __shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
-    __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2];
-    
-    const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
-
-    const uint_fast64_t thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;    // global thread index
-    const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);          // thread index within the vector
-    const uint_fast64_t vector_id   = thread_id   /  THREADS_PER_VECTOR;               // global vector index
-    const uint_fast64_t vector_lane = threadIdx.x /  THREADS_PER_VECTOR;               // vector index within the block
-    const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x;                   // total number of active vectors
-
-    for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
-    {
-        // use two threads to fetch Ap[row] and Ap[row+1]
-        // this is considerably faster than the straightforward version
-        if(thread_lane < 2)
-            ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane];
-
-        const uint_fast64_t row_start = ptrs[vector_lane][0];                   //same as: row_start = Ap[row];
-        const uint_fast64_t row_end   = ptrs[vector_lane][1];                   //same as: row_end   = Ap[row+1];
-
-        // initialize local sum
-        float sum = 0;
-     
-        if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
-        {
-            // ensure aligned memory access to Aj and Ax
-
-            uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
-
-            // accumulate local sums
-            if(jj >= row_start && jj < row_end) {
-#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
-				sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x);
-#else
-				sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x);
-#endif
-                //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
-			}
-				
-            // accumulate local sums
-            for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) {
-                //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
-#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
-				sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x);
-#else
-				sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x);
-#endif
-			}
-        } else {
-            // accumulate local sums
-            for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) {
-                //sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
-#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
-				sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x);
-#else
-				sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x);
-#endif
-			}
-        }
-
-        // store local sum in shared memory
-        sdata[threadIdx.x] = sum;
-        
-        // reduce local sums to row sum
-        if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
-        if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
-        if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
-        if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
-        if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];
-       
-        // first thread writes the result
-        if (thread_lane == 0)
-            y[row] = sdata[threadIdx.x];
-    }
-}
-
-template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
-__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1)
-__global__ void
-storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer)
-{
-    __shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2];  // padded to avoid reduction conditionals
-    __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2];
-    
-    const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW;
-
-    const uint_fast64_t thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;    // global thread index
-    const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1);          // thread index within the vector
-    const uint_fast64_t vector_id   = thread_id   /  THREADS_PER_ROW;               // global vector index
-    const uint_fast64_t vector_lane = threadIdx.x /  THREADS_PER_ROW;               // vector index within the block
-    const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x;                   // total number of active vectors
-
-    for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
-    {
-        // use two threads to fetch Ap[row] and Ap[row+1]
-        // this is considerably faster than the straightforward version
-        if(thread_lane < 2)
-            ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane];
-
-        const uint_fast64_t row_start = ptrs[vector_lane][0];                   //same as: row_start = Ap[row];
-        const uint_fast64_t row_end   = ptrs[vector_lane][1];                   //same as: row_end   = Ap[row+1];
-
-        // initialize local Min/Max
-        float localMinMaxElement = minMaxInitializer;
-
-        if (THREADS_PER_ROW == 32 && row_end - row_start > 32)
-        {
-            // ensure aligned memory access to Aj and Ax
-
-            uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane;
-
-            // accumulate local sums
-            if(jj >= row_start && jj < row_end) {
-				if(Minimize) {
-					localMinMaxElement = min(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
-				} else {
-					localMinMaxElement = max(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
-				}
-			}
-
-            // accumulate local sums
-            for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW)
-                if(Minimize) {
-					localMinMaxElement = min(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
-				} else {
-					localMinMaxElement = max(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
-				}
-        }
-        else
-        {
-            // accumulate local sums
-            for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW)
-                if(Minimize) {
-					localMinMaxElement = min(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
-				} else {
-					localMinMaxElement = max(localMinMaxElement, y[jj]);
-					//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
-				}
-        }
-
-        // store local sum in shared memory
-        sdata[threadIdx.x] = localMinMaxElement;
-        
-        // reduce local min/max to row min/max
-		if (Minimize) {
-			/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
-			if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
-			if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
-			if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
-			if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/
-
-			if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]);
-			if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  8]);
-			if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  4]);
-			if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  2]);
-			if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x +  1]);
-		} else {
-			/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
-			if (THREADS_PER_ROW >  8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
-			if (THREADS_PER_ROW >  4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
-			if (THREADS_PER_ROW >  2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
-			if (THREADS_PER_ROW >  1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/
-			if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]);
-			if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]);
-			if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]);
-			if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]);
-			if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]);
-		}
-       
-        // first thread writes the result
-        if (thread_lane == 0)
-            x[row] = sdata[threadIdx.x];
-    }
-}
-
-template <bool Minimize, unsigned int THREADS_PER_VECTOR>
-void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y)
-{
-	float __minMaxInitializer = -std::numeric_limits<float>::max();
-	if (Minimize) {
-		__minMaxInitializer = std::numeric_limits<float>::max();
-	}
-	const float minMaxInitializer = __minMaxInitializer;
-
-    const size_t THREADS_PER_BLOCK  = 128;
-    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
-
-    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0);
-    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
-
-    storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> 
-        (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer);
-}
-
-template <bool Minimize>
-void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y)
-{
-    const uint_fast64_t rows_per_group = num_entries / num_rows;
-
-    if (rows_per_group <=  2) { __storm_cuda_opt_vector_reduce_float<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; }
-    if (rows_per_group <=  4) { __storm_cuda_opt_vector_reduce_float<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; }
-    if (rows_per_group <=  8) { __storm_cuda_opt_vector_reduce_float<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; }
-    if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; }
-    
-    __storm_cuda_opt_vector_reduce_float<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y);
-}
-
-template <bool UseCache, unsigned int THREADS_PER_VECTOR>
-void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
-{
-    const size_t THREADS_PER_BLOCK  = 128;
-    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
-
-    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
-    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
-    
-    if (UseCache)
-        bind_x(x);
-
-    storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> 
-        (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
-
-    if (UseCache)
-        unbind_x(x);
-}
-
-void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
-{
-    const uint_fast64_t nnz_per_row = num_entries / num_rows;
-
-    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    
-    __storm_cuda_opt_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
-}
-
-void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
-{
-    const uint_fast64_t nnz_per_row = num_entries / num_rows;
-
-    if (nnz_per_row <=  2) { __storm_cuda_opt_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <=  4) { __storm_cuda_opt_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <=  8) { __storm_cuda_opt_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
-
-    __storm_cuda_opt_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
-}
-
-// NON-OPT
-
-template <bool UseCache, unsigned int THREADS_PER_VECTOR>
-void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
-{
-    const size_t THREADS_PER_BLOCK  = 128;
-    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
-
-    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
-    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
-    
-    if (UseCache)
-        bind_x(x);
-
-    spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> 
-        (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
-
-    if (UseCache)
-        unbind_x(x);
-}
-
-void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
-{
-    const uint_fast64_t nnz_per_row = num_entries / num_rows;
-
-    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    
-    __storm_cuda_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
-}
-
-void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
-{
-    const uint_fast64_t nnz_per_row = num_entries / num_rows;
-
-    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-
-    __storm_cuda_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
-}
-
-} // end namespace device
-} // end namespace detail
-} // end namespace cusp
--- a/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
+++ b/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
@ -1,39 +0,0 @@
-#include <iostream>
-#include <chrono>
-
-__global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) {
-	*B = *A;
-}
-
-void kernelSwitchTest(size_t N) {
-	int* deviceIntA;
-	int* deviceIntB;
-
-	if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) {
-		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
-		return;
-	}
-	if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) {
-		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
-		return;
-	}
-
-	// Allocate space on the device
-	auto start_time = std::chrono::high_resolution_clock::now();
-	for (int i = 0; i < N; ++i) {
-		cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB);
-	}
-	auto end_time = std::chrono::high_resolution_clock::now();
-	std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
-	std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl;
-
-	// Free memory on device
-	if (cudaFree(deviceIntA) != cudaSuccess) {
-		std::cout << "Error in cudaFree!" << std::endl;
-		return;
-	}
-	if (cudaFree(deviceIntB) != cudaSuccess) {
-		std::cout << "Error in cudaFree!" << std::endl;
-		return;
-	}
-}
--- a/resources/cudaForStorm/srcCuda/kernelSwitchTest.h
+++ b/resources/cudaForStorm/srcCuda/kernelSwitchTest.h
@ -1 +0,0 @@
-void kernelSwitchTest(size_t N);
--- a/resources/cudaForStorm/srcCuda/utility.cu
+++ b/resources/cudaForStorm/srcCuda/utility.cu
@ -1,33 +0,0 @@
-#include "utility.h"
-
-#include <cuda_runtime.h>
-
-size_t getFreeCudaMemory() {
-	size_t freeMemory;
-	size_t totalMemory;
-	cudaMemGetInfo(&freeMemory, &totalMemory);
-
-	return freeMemory;
-}
-
-size_t getTotalCudaMemory() {
-	size_t freeMemory;
-	size_t totalMemory;
-	cudaMemGetInfo(&freeMemory, &totalMemory);
-
-	return totalMemory;
-}
-
-bool resetCudaDevice() {
-	cudaError_t result = cudaDeviceReset();
-	return (result == cudaSuccess);
-}
-
-int getRuntimeCudaVersion() {
-	int result = -1;
-	cudaError_t errorResult = cudaRuntimeGetVersion(&result);
-	if (errorResult != cudaSuccess) {
-		return -1;
-	}
-	return result;
-}
--- a/resources/cudaForStorm/srcCuda/utility.h
+++ b/resources/cudaForStorm/srcCuda/utility.h
@ -1,12 +0,0 @@
-#ifndef STORM_CUDAFORSTORM_UTILITY_H_
-#define STORM_CUDAFORSTORM_UTILITY_H_
-
-// Library exports
-#include "cudaForStorm_Export.h"
-
-cudaForStorm_EXPORT size_t getFreeCudaMemory();
-cudaForStorm_EXPORT size_t getTotalCudaMemory();
-cudaForStorm_EXPORT bool resetCudaDevice();
-cudaForStorm_EXPORT int getRuntimeCudaVersion();
-
-#endif // STORM_CUDAFORSTORM_UTILITY_H_
--- a/resources/cudaForStorm/srcCuda/version.cu
+++ b/resources/cudaForStorm/srcCuda/version.cu
@ -1,28 +0,0 @@
-#include "version.h"
-
-#include "storm-cudaplugin-config.h"
-
-size_t getStormCudaPluginVersionMajor() {
-	return STORM_CUDAPLUGIN_VERSION_MAJOR;
-}
-
-size_t getStormCudaPluginVersionMinor() {
-	return STORM_CUDAPLUGIN_VERSION_MINOR;
-}
-
-size_t getStormCudaPluginVersionPatch() {
-	return STORM_CUDAPLUGIN_VERSION_PATCH;
-}
-
-size_t getStormCudaPluginVersionCommitsAhead() {
-	return STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD;
-}
-
-const char* getStormCudaPluginVersionHash() {
-	static const std::string versionHash = STORM_CUDAPLUGIN_VERSION_HASH;
-	return versionHash.c_str();
-}
-
-bool getStormCudaPluginVersionIsDirty() {
-	return ((STORM_CUDAPLUGIN_VERSION_DIRTY) != 0);
-}
--- a/resources/cudaForStorm/srcCuda/version.h
+++ b/resources/cudaForStorm/srcCuda/version.h
@ -1,16 +0,0 @@
-#ifndef STORM_CUDAFORSTORM_VERSION_H_
-#define STORM_CUDAFORSTORM_VERSION_H_
-
-// Library exports
-#include "cudaForStorm_Export.h"
-
-#include <string>
-
-cudaForStorm_EXPORT size_t getStormCudaPluginVersionMajor();
-cudaForStorm_EXPORT size_t getStormCudaPluginVersionMinor();
-cudaForStorm_EXPORT size_t getStormCudaPluginVersionPatch();
-cudaForStorm_EXPORT size_t getStormCudaPluginVersionCommitsAhead();
-cudaForStorm_EXPORT const char* getStormCudaPluginVersionHash();
-cudaForStorm_EXPORT bool getStormCudaPluginVersionIsDirty();
-
-#endif // STORM_CUDAFORSTORM_VERSION_H_
--- a/resources/cudaForStorm/storm-cudaplugin-config.h.in
+++ b/resources/cudaForStorm/storm-cudaplugin-config.h.in
@ -1,21 +0,0 @@
-/*
- * StoRM - Build-in Options
- *
- * This file is parsed by CMake during makefile generation
- */
-
-#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
-#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
-
-// Version Information
-#define STORM_CUDAPLUGIN_VERSION_MAJOR @STORM_CUDAPLUGIN_VERSION_MAJOR@					// The major version of StoRM
-#define STORM_CUDAPLUGIN_VERSION_MINOR @STORM_CUDAPLUGIN_VERSION_MINOR@					// The minor version of StoRM
-#define STORM_CUDAPLUGIN_VERSION_PATCH @STORM_CUDAPLUGIN_VERSION_PATCH@					// The patch version of StoRM
-#define STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD @STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD@ 	// How many commits passed since the tag was last set
-#define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" 					// The short hash of the git commit this build is bases on
-#define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ 					// 0 iff there no files were modified in the checkout, 1 else
-
-// Whether the size of float in a pair<uint_fast64_t, float> is expanded to 64bit
-#@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
-
-#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
--- a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h
+++ b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h
@ -15,7 +15,6 @@

 namespace storm {
 namespace modelchecker {
-namespace prctl {

 /*
 * An implementation of the SparseMdpPrctlModelChecker interface that uses topoligical value iteration for solving
@ -38,7 +37,7 @@ public:
 	 * Copy constructs a SparseMdpPrctlModelChecker from the given model checker. In particular, this means that the newly
 	 * constructed model checker will have the model of the given model checker as its associated model.
 	 */
-	explicit TopologicalValueIterationMdpPrctlModelChecker(storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<Type> const& modelchecker)
+	explicit TopologicalValueIterationMdpPrctlModelChecker(storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker<Type> const& modelchecker)
    : SparseMdpPrctlModelChecker<Type>(modelchecker) {
 		// Intentionally left empty.
 	}
@ -49,7 +48,6 @@ public:
 	virtual ~TopologicalValueIterationMdpPrctlModelChecker() { }
 };

-} // namespace prctl
 } // namespace modelchecker
 } // namespace storm

--- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
@ -36,6 +36,12 @@ namespace storm {
 			this->maximalNumberOfIterations = settings.getMaximalIterationCount();
 			this->precision = settings.getPrecision();
 			this->relative = (settings.getConvergenceCriterion() == storm::settings::modules::TopologicalValueIterationEquationSolverSettings::ConvergenceCriterion::Relative);
+
+			auto generalSettings = storm::settings::generalSettings();
+			this->enableCuda = generalSettings.isCudaSet();
+#ifdef STORM_HAVE_CUDA
+			STORM_LOG_INFO_COND(this->enableCuda, "Option CUDA was not set, but the topological value iteration solver will use it anyways.");
+#endif
        }
        
        template<typename ValueType>
--- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
+++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
@ -44,6 +44,8 @@ namespace storm {
            
            virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult = nullptr, std::vector<ValueType>* newX = nullptr) const override;
 		private:
+
+			bool enableCuda;
 			/*!
 			 * Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory.
 			 */
--- a/src/utility/cli.h
+++ b/src/utility/cli.h
@ -25,6 +25,10 @@
 #ifdef STORM_HAVE_MSAT
 #   include "mathsat.h"
 #endif
+#ifdef STORM_HAVE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif

 #include "log4cplus/logger.h"
 #include "log4cplus/loggingmacros.h"
@ -60,6 +64,7 @@ log4cplus::Logger printer;
 #include "src/modelchecker/prctl/SparseDtmcPrctlModelChecker.h"
 #include "src/modelchecker/reachability/SparseDtmcEliminationModelChecker.h"
 #include "src/modelchecker/prctl/SparseMdpPrctlModelChecker.h"
+#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h"

 // Headers for counterexample generation.
 #include "src/counterexamples/MILPMinimalLabelSetGenerator.h"
@ -151,6 +156,44 @@ namespace storm {
                std::cout << "Linked with " << msatVersion << "." << std::endl;
                msat_free(msatVersion);
 #endif
+#ifdef STORM_HAVE_CUDA
+				int deviceCount = 0;
+				cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
+
+				if (error_id == cudaSuccess)
+				{
+					std::cout << "Compiled with CUDA support, ";
+					// This function call returns 0 if there are no CUDA capable devices.
+					if (deviceCount == 0)
+					{
+						std::cout<< "but there are no available device(s) that support CUDA." << std::endl;
+					} else
+					{
+						std::cout << "detected " << deviceCount << " CUDA Capable device(s):" << std::endl;
+					}
+
+					int dev, driverVersion = 0, runtimeVersion = 0;
+
+					for (dev = 0; dev < deviceCount; ++dev)
+					{
+						cudaSetDevice(dev);
+						cudaDeviceProp deviceProp;
+						cudaGetDeviceProperties(&deviceProp, dev);
+
+						std::cout << "CUDA Device " << dev << ": \"" << deviceProp.name << "\"" << std::endl;
+
+						// Console log
+						cudaDriverGetVersion(&driverVersion);
+						cudaRuntimeGetVersion(&runtimeVersion);
+						std::cout << "  CUDA Driver Version / Runtime Version          " << driverVersion / 1000 << "." << (driverVersion % 100) / 10 << " / " << runtimeVersion / 1000 << "." << (runtimeVersion % 100) / 10 << std::endl;
+						std::cout << "  CUDA Capability Major/Minor version number:    " << deviceProp.major<<"."<<deviceProp.minor <<std::endl;
+					}
+					std::cout << std::endl;
+				}
+				else {
+					std::cout << "Compiled with CUDA support, but an error occured trying to find CUDA devices." << std::endl;
+				}
+#endif
                
                // "Compute" the command line argument string with which STORM was invoked.
                std::stringstream commandStream;
@ -362,9 +405,19 @@ namespace storm {
                            }
                        }
                    } else if (model->getType() == storm::models::MDP) {
-                        std::shared_ptr<storm::models::Mdp<ValueType>> mdp = model->template as<storm::models::Mdp<ValueType>>();
-                        storm::modelchecker::SparseMdpPrctlModelChecker<ValueType> modelchecker(*mdp);
-                        result = modelchecker.check(*formula.get());
+						std::shared_ptr<storm::models::Mdp<ValueType>> mdp = model->template as<storm::models::Mdp<ValueType>>();
+#ifdef STORM_HAVE_CUDA
+						if (settings.isCudaSet()) {
+							storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker<ValueType> modelchecker(*mdp);
+							result = modelchecker.check(*formula.get());
+						} else {
+							storm::modelchecker::SparseMdpPrctlModelChecker<ValueType> modelchecker(*mdp);
+							result = modelchecker.check(*formula.get());
+						}
+#else
+						storm::modelchecker::SparseMdpPrctlModelChecker<ValueType> modelchecker(*mdp);
+						result = modelchecker.check(*formula.get());
+#endif
                    }
                    
                    if (result) {
--- a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
+++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
@ -20,7 +20,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) {
 	ASSERT_EQ(mdp->getNumberOfStates(), 169ull);
 	ASSERT_EQ(mdp->getNumberOfTransitions(), 436ull);
    
-	storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp);
+	storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp);
    
 	//storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("two");
 	auto apFormula = std::make_shared<storm::logic::AtomicLabelFormula>("two");
@ -138,7 +138,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) {
 	// ------------- state rewards --------------
 	std::shared_ptr<storm::models::Mdp<double>> stateRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", "")->as<storm::models::Mdp<double>>();
    
-	storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> stateRewardModelChecker(*stateRewardMdp);
+	storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker<double> stateRewardModelChecker(*stateRewardMdp);


 	apFormula = std::make_shared<storm::logic::AtomicLabelFormula>("done");
@ -174,7 +174,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) {
 	// -------------------------------- state and transition reward ------------------------
 	std::shared_ptr<storm::models::Mdp<double>> stateAndTransitionRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew")->as<storm::models::Mdp<double>>();
    
-	storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp);
+	storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker<double> stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp);
    

 	apFormula = std::make_shared<storm::logic::AtomicLabelFormula>("done");
@ -214,7 +214,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) {
 	ASSERT_EQ(mdp->getNumberOfStates(), 3172ull);
 	ASSERT_EQ(mdp->getNumberOfTransitions(), 7144ull);

-	storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp);
+	storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp);

 	auto apFormula = std::make_shared<storm::logic::AtomicLabelFormula>("elected");
 	auto eventuallyFormula = std::make_shared<storm::logic::EventuallyFormula>(apFormula);