Compiles with CUSP :)

Former-commit-id: 78555303bf
12 years ago · 71e077f420
8 changed files with 337 additions and 129 deletions
--- a/resources/cmake/FindCusp.cmake
+++ b/resources/cmake/FindCusp.cmake
@ -0,0 +1,55 @@
 #
 # FindCusp
 #
 # This module finds the CUSP header files and extracts their version.  It
 # sets the following variables.
 #
 # CUSP_INCLUDE_DIR -  Include directory for cusp header files.  (All header
 #                       files will actually be in the cusp subdirectory.)
 # CUSP_VERSION -      Version of cusp in the form "major.minor.patch".
 #
 # CUSP_FOUND - Indicates whether Cusp has been found
 #
 find_path(CUSP_INCLUDE_DIR
 	HINTS
 		/usr/include/cusp
 		/usr/local/include
 		/usr/local/cusp/include
 		${CUSP_INCLUDE_DIRS}
 		${CUSP_HINT}
 	NAMES cusp/version.h
 	DOC "Cusp headers"
 )
 if(CUSP_INCLUDE_DIR)
 	list(REMOVE_DUPLICATES CUSP_INCLUDE_DIR)
 endif(CUSP_INCLUDE_DIR)
 # Find cusp version
 file(STRINGS ${CUSP_INCLUDE_DIR}/cusp/version.h
 	version
 	REGEX "#define CUSP_VERSION[ \t]+([0-9x]+)"
 )
 string(REGEX REPLACE
 	"#define CUSP_VERSION[ \t]+"
 	""
 	version
 	"${version}"
 )
 #define CUSP_MAJOR_VERSION     (CUSP_VERSION / 100000)
 #define CUSP_MINOR_VERSION     (CUSP_VERSION / 100 % 1000)
 #define CUSP_SUBMINOR_VERSION  (CUSP_VERSION % 100)
 math(EXPR CUSP_MAJOR_VERSION "${version} / 100000")
 math(EXPR CUSP_MINOR_VERSION "${version} / 100 % 1000")
 math(EXPR CUSP_PATCH_VERSION "${version} % 100")
 set(CUSP_VERSION "${CUSP_MAJOR_VERSION}.${CUSP_MINOR_VERSION}.${CUSP_PATCH_VERSION}")
 # Check for required components
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(Cusp REQUIRED_VARS CUSP_INCLUDE_DIR VERSION_VAR CUSP_VERSION)
 set(CUSP_INCLUDE_DIRS ${CUSP_INCLUDE_DIR})
 mark_as_advanced(CUSP_INCLUDE_DIR)
--- a/resources/cmake/FindThrust.cmake
+++ b/resources/cmake/FindThrust.cmake
@ -1,53 +1,52 @@
 #
 # FindThrust
 #
 # This module finds the Thrust header files and extrats their version.  It
 # This module finds the Thrust header files and extracts their version.  It
 # sets the following variables.
 #
 # THRUST_INCLUDE_DIR -  Include directory for thrust header files.  (All header
 #                       files will actually be in the thrust subdirectory.)
 # THRUST_VERSION -      Version of thrust in the form "major.minor.patch".
 #
 # Thrust_FOUND - Indicates whether Thrust has been found
 #
 find_path( THRUST_INCLUDE_DIR
  HINTS
    /usr/include/cuda
    /usr/local/include
    /usr/local/cuda/include
    ${CUDA_INCLUDE_DIRS}
  NAMES thrust/version.h
  DOC "Thrust headers"
  )
 if( THRUST_INCLUDE_DIR )
  list( REMOVE_DUPLICATES THRUST_INCLUDE_DIR )
 endif( THRUST_INCLUDE_DIR )
 find_path(THRUST_INCLUDE_DIR
 	HINTS
 		/usr/include/cuda
 		/usr/local/include
 		/usr/local/cuda/include
 		${CUDA_INCLUDE_DIRS}
 	NAMES thrust/version.h
 	DOC "Thrust headers"
 )
 if(THRUST_INCLUDE_DIR)
 	list(REMOVE_DUPLICATES THRUST_INCLUDE_DIR)
 endif(THRUST_INCLUDE_DIR)
 # Find thrust version
 file( STRINGS ${THRUST_INCLUDE_DIR}/thrust/version.h
  version
  REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)"
  )
 string( REGEX REPLACE
  "#define THRUST_VERSION[ \t]+"
  ""
  version
  "${version}"
  )
 file(STRINGS ${THRUST_INCLUDE_DIR}/thrust/version.h
 	version
 	REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)"
 )
 string(REGEX REPLACE
 	"#define THRUST_VERSION[ \t]+"
 	""
 	version
 	"${version}"
 )
 string( REGEX MATCH "^[0-9]" major ${version} )
 string( REGEX REPLACE "^${major}00" "" version "${version}" )
 string( REGEX MATCH "^[0-9]" minor ${version} )
 string( REGEX REPLACE "^${minor}0" "" version "${version}" )
 set( THRUST_VERSION "${major}.${minor}.${version}")
 set( THRUST_MAJOR_VERSION "${major}")
 set( THRUST_MINOR_VERSION "${minor}")
 string(REGEX MATCH "^[0-9]" major ${version})
 string(REGEX REPLACE "^${major}00" "" version "${version}")
 string(REGEX MATCH "^[0-9]" minor ${version})
 string(REGEX REPLACE "^${minor}0" "" version "${version}")
 set(THRUST_VERSION "${major}.${minor}.${version}")
 set(THRUST_MAJOR_VERSION "${major}")
 set(THRUST_MINOR_VERSION "${minor}")
 # Check for required components
 include( FindPackageHandleStandardArgs )
 find_package_handle_standard_args( Thrust
  REQUIRED_VARS THRUST_INCLUDE_DIR
  VERSION_VAR THRUST_VERSION
  )
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(Thrust REQUIRED_VARS THRUST_INCLUDE_DIR VERSION_VAR THRUST_VERSION)
 set(THRUST_INCLUDE_DIRS ${THRUST_INCLUDE_DIR})
 mark_as_advanced(THRUST_INCLUDE_DIR)
--- a/resources/cudaForStorm/CMakeAlignmentCheck.cpp
+++ b/resources/cudaForStorm/CMakeAlignmentCheck.cpp
@ -0,0 +1,64 @@
 /*
 * This is component of StoRM - Cuda Plugin to check whether type alignment matches the assumptions done while optimizing the code.
 */
 #include <cstdint>
 #include <utility>
 #include <vector>
 #define CONTAINER_SIZE 100ul
 template <typename IndexType, typename ValueType>
 int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
 	std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
 	for (size_t i = 0; i < containerSize; ++i) {
 		myVector->push_back(std::make_pair(firstValue, secondValue));
 	}
 	size_t myVectorSize = myVector->size();
 	IndexType* firstStart = &(myVector->at(0).first);
 	IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
 	ValueType* secondStart = &(myVector->at(0).second);
 	ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
 	size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
 	size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
 	size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
 	size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);
 	delete myVector;
 	myVector = nullptr;
 	if (myVectorSize != containerSize) {
 		return -2;
 	}
 	// Check for alignment:
 	// Requirement is that the pairs are aligned like: first, second, first, second, first, second, ...
 	if (sizeof(IndexType) != sizeof(ValueType)) {
 		return -3;
 	}
 	if (startOffset != sizeof(IndexType)) {
 		return -4;
 	}
 	if (endOffset != sizeof(IndexType)) {
 		return -5;
 	}
 	if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
 		return -6;
 	}
 	if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
 		return -7;
 	}
 	return 0;
 }
 int main(int argc, char* argv[]) {
 	int result = 0;
 	result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
 	if (result != 0) {
 		return result;
 	}
 	return 0;
 }
--- a/resources/cudaForStorm/CMakeLists.txt
+++ b/resources/cudaForStorm/CMakeLists.txt
@ -35,9 +35,12 @@ set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE
 # Add the resources/cmake folder to Module Search Path for FindTBB.cmake
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/")
 # Set the hint for CUSP
 set(CUSP_HINT "${PROJECT_SOURCE_DIR}/../3rdparty/cusplibrary")
 find_package(CUDA REQUIRED)
 find_package(Cusp REQUIRED)
 find_package(Doxygen REQUIRED)
 find_package(Threads REQUIRED)
 find_package(Thrust REQUIRED)
 # If the DEBUG option was turned on, we will target a debug version and a release version otherwise
@ -115,10 +118,23 @@ endif()
 ##
 #############################################################
 # Test for type alignment
 try_run(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT
 	${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp"
 	COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR
 )
 if(NOT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT)
 	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}")
 elseif(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT EQUAL 0)
 	message(STATUS "StoRM (CudaPlugin) - Result of Type Alignment Check: OK.")
 else()
 	message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})")
 endif()
 # Configure a header file to pass some of the CMake settings to the source code
 configure_file (
 	"${PROJECT_SOURCE_DIR}/../../storm-config.h.in"
 	"${PROJECT_BINARY_DIR}/include/storm-config.h"
 	"${PROJECT_SOURCE_DIR}/storm-cudaplugin-config.h.in"
 	"${PROJECT_BINARY_DIR}/include/storm-cudaplugin-config.h"
 )
 # Add the binary dir include directory for storm-config.h
 include_directories("${PROJECT_BINARY_DIR}/include")
@ -161,22 +177,6 @@ endif(ADDITIONAL_LINK_DIRS)
 #############################################################
 ###############################################################################
 ##                                                                            #
 ##	Executable Creation                                                       #
 ##                                                                            #
 ##  All link_directories() calls MUST be made before this point               #
 ##                                                                            #
 ###############################################################################
 include (GenerateExportHeader)
 #add_library(cudaForStorm SHARED ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES})
 #GENERATE_EXPORT_HEADER( cudaForStorm
 #             BASE_NAME cudaForStorm
 #             EXPORT_MACRO_NAME cudaForStorm_EXPORT
 #             EXPORT_FILE_NAME include/cudaForStorm_Export.h
 #             STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
 #)
 #############################################################
 ##
@ -184,18 +184,6 @@ include (GenerateExportHeader)
 ##
 #############################################################
 #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30)
 cuda_add_library(cudaForStorm SHARED
  ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}
  OPTIONS -DSTUFF="" -arch=sm_30
  RELEASE -DNDEBUG
  DEBUG -g -DDEBUG
 )
 GENERATE_EXPORT_HEADER( cudaForStorm
             BASE_NAME cudaForStorm
             EXPORT_MACRO_NAME cudaForStorm_EXPORT
             EXPORT_FILE_NAME include/cudaForStorm_Export.h
             STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
 )
 #target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY})
 #ADD_DEPENDENCIES(cudaForStorm cudaLibrary)
 #target_link_libraries(cudaForStorm cudaLibrary)
@ -204,19 +192,52 @@ include_directories(${CUDA_INCLUDE_DIRS})
 #############################################################
 ##
 ##	Threads
 ##	CUSP
 ##
 #############################################################
 include_directories(${THREADS_INCLUDE_DIRS})
 target_link_libraries(cudaForStorm ${CMAKE_THREAD_LIBS_INIT})
 if(CUSP_FOUND)
 	include_directories(${CUSP_INCLUDE_DIR})
 	cuda_include_directories(${CUSP_INCLUDE_DIR})
 	message(STATUS "StoRM (CudaPlugin) - Found CUSP Version ${CUSP_VERSION} in location ${CUSP_INCLUDE_DIR}")
 else()
 	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find CUSP!")
 endif()
 #############################################################
 ##
 ##	Thrust
 ##
 #############################################################
 include_directories(${THRUST_INCLUDE_DIR})
 message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION}")
 if(THRUST_FOUND)
 	include_directories(${THRUST_INCLUDE_DIR})
 	cuda_include_directories(${THRUST_INCLUDE_DIR})
 	message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION} in location ${THRUST_INCLUDE_DIR}")
 else()
 	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find Thrust! Check your CUDA installation.")
 endif()
 ###############################################################################
 ##                                                                            #
 ##	Executable Creation                                                       #
 ##                                                                            #
 ##  All link_directories() calls AND include_directories() calls              #
 ##  MUST be made before this point                                            #
 ##                                                                            #
 ###############################################################################
 include (GenerateExportHeader)
 cuda_add_library(cudaForStorm SHARED
  ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}
  OPTIONS -DSTUFF="" -arch=sm_30
  RELEASE -DNDEBUG
  DEBUG -g -DDEBUG
 )
 GENERATE_EXPORT_HEADER( cudaForStorm
             BASE_NAME cudaForStorm
             EXPORT_MACRO_NAME cudaForStorm_EXPORT
             EXPORT_FILE_NAME include/cudaForStorm_Export.h
             STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
 )
 if (MSVC)
 	# Add the DebugHelper DLL
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
@ -6,61 +6,54 @@
 #include <cuda_runtime.h>
 #include "cusparse_v2.h"
 #include "cuspExtension.h"
 __global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) {
 	*B = *A;
 }
 void cudaForStormTestFunction(int a, int b) {
 	std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl;
 }
 void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
 	if (sizeof(double) != sizeof(uint_fast64_t)) {
 		std::cout << "FATAL ERROR - Internal Sizes of Double and uint_fast64_t do NOT match, CUDA acceleration not possible!" << std::endl;
 		return;
 	}
 	uint_fast64_t* device_matrixRowIndices = nullptr;
 	uint_fast64_t* device_matrixColIndicesAndValues = nullptr;
 	double* device_x = nullptr;
 	double* device_b = nullptr;
 	double* device_multiplyResult = nullptr;
 	uint_fast64_t* device_nondeterministicChoiceIndices = nullptr;
 template <typename IndexType, typename ValueType>
 void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) {
 	IndexType* device_matrixRowIndices = nullptr;
 	IndexType* device_matrixColIndicesAndValues = nullptr;
 	ValueType* device_x = nullptr;
 	ValueType* device_b = nullptr;
 	ValueType* device_multiplyResult = nullptr;
 	IndexType* device_nondeterministicChoiceIndices = nullptr;
 	cudaError_t cudaMallocResult;
 	cudaMallocResult = cudaMalloc<uint_fast64_t>(&device_matrixRowIndices, matrixRowIndices.size());
 	cudaMallocResult = cudaMalloc<IndexType>(&device_matrixRowIndices, matrixRowIndices.size());
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}
 	cudaMallocResult = cudaMalloc<uint_fast64_t>(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2);
 	cudaMallocResult = cudaMalloc<IndexType>(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2);
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}
 	cudaMallocResult = cudaMalloc<double>(&device_x, x.size());
 	cudaMallocResult = cudaMalloc<ValueType>(&device_x, x.size());
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}
 	cudaMallocResult = cudaMalloc<double>(&device_b, b.size());
 	cudaMallocResult = cudaMalloc<ValueType>(&device_b, b.size());
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}
 	cudaMallocResult = cudaMalloc<double>(&device_multiplyResult, b.size());
 	cudaMallocResult = cudaMalloc<ValueType>(&device_multiplyResult, b.size());
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}
 	cudaMallocResult = cudaMalloc<uint_fast64_t>(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size());
 	cudaMallocResult = cudaMalloc<IndexType>(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size());
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
@ -69,31 +62,31 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve
 	// Memory allocated, copy data to device
 	cudaError_t cudaCopyResult;
 	cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(uint_fast64_t) * matrixRowIndices.size(), cudaMemcpyHostToDevice);
 	cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * matrixRowIndices.size(), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}
 	cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(uint_fast64_t) * columnIndicesAndValues.size()) + (sizeof(double) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice);
 	cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * columnIndicesAndValues.size()) + (sizeof(ValueType) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}
 	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(double) * x.size(), cudaMemcpyHostToDevice);
 	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * x.size(), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}
 	cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(double) * b.size(), cudaMemcpyHostToDevice);
 	cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * b.size(), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}
 	cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(uint_fast64_t) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice);
 	cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
@ -101,6 +94,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve
 	// Data is on device, start Kernel
 	// All code related to freeing memory and clearing up the device
 cleanup:
 	if (device_matrixRowIndices != nullptr) {
@ -148,35 +142,14 @@ cleanup:
 }
 /*
 void kernelSwitchTest(size_t N) {
 	int* deviceIntA;
 	int* deviceIntB;
 * Declare and implement all exported functions for these Kernels here
 *
 */
 	if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) {
 		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
 		return;
 	}
 	if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) {
 		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
 		return;
 	}
 	// Allocate space on the device
 	auto start_time = std::chrono::high_resolution_clock::now();
 	for (int i = 0; i < N; ++i) {
 		cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB);
 	}
 	auto end_time = std::chrono::high_resolution_clock::now();
 	std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
 	std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl;
 void cudaForStormTestFunction(int a, int b) {
 	std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl;
 }
 	// Free memory on device
 	if (cudaFree(deviceIntA) != cudaSuccess) {
 		std::cout << "Error in cudaFree!" << std::endl;
 		return;
 	}
 	if (cudaFree(deviceIntB) != cudaSuccess) {
 		std::cout << "Error in cudaFree!" << std::endl;
 		return;
 	}
 }*/
 void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
 	basicValueIteration_mvReduce<uint_fast64_t, double>(maxIterationCount, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
 }
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.h
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h
@ -6,4 +6,4 @@
 #include "cudaForStorm_Export.h"
 cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b);
 cudaForStorm_EXPORT void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
 cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
--- a/resources/cudaForStorm/srcCuda/cuspExtension.h
+++ b/resources/cudaForStorm/srcCuda/cuspExtension.h
@ -0,0 +1,83 @@
 /*
 * This is an extension of the original CUSP csr_vector.h SPMV implementation.
 * It is based on the Code and incorporates changes as to cope with the details
 * of the StoRM code.
 * As this is mostly copy & past, the original license still applies.
 */
 /*
 *  Copyright 2008-2009 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
 #pragma once
 #include <cusp/detail/device/spmv/csr_vector.h>
 namespace cusp
 {
 namespace detail
 {
 namespace device
 {
 template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
 void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
 {
    const size_t THREADS_PER_BLOCK  = 128;
    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
    if (UseCache)
        bind_x(x);
    spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> 
        (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
    if (UseCache)
        unbind_x(x);
 }
 template <typename IndexType, typename ValueType>
 void storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
 {
    const IndexType nnz_per_row = num_entries / num_rows;
    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    __storm_cuda_spmv_csr_vector<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
 }
 template <typename IndexType, typename ValueType>
 void storm_cuda_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
 {
    const IndexType nnz_per_row = num_entries / num_rows;
    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
    __storm_cuda_spmv_csr_vector<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
 }
 } // end namespace device
 } // end namespace detail
 } // end namespace cusp
--- a/resources/cudaForStorm/storm-cudaplugin-config.h.in
+++ b/resources/cudaForStorm/storm-cudaplugin-config.h.in
@ -0,0 +1,13 @@
 /*
 * StoRM - Build-in Options
 *
 * This file is parsed by CMake during makefile generation
 * It contains information such as the base path to the test/example data
 */
 #ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
 #define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
 #endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_