Compiles with CUSP :)

Former-commit-id: 78555303bf
11 years ago · 71e077f420
8 changed files with 337 additions and 129 deletions
--- a/resources/cmake/FindCusp.cmake
+++ b/resources/cmake/FindCusp.cmake
@ -0,0 +1,55 @@
+#
+# FindCusp
+#
+# This module finds the CUSP header files and extracts their version.  It
+# sets the following variables.
+#
+# CUSP_INCLUDE_DIR -  Include directory for cusp header files.  (All header
+#                       files will actually be in the cusp subdirectory.)
+# CUSP_VERSION -      Version of cusp in the form "major.minor.patch".
+#
+# CUSP_FOUND - Indicates whether Cusp has been found
+#
+
+find_path(CUSP_INCLUDE_DIR
+	HINTS
+		/usr/include/cusp
+		/usr/local/include
+		/usr/local/cusp/include
+		${CUSP_INCLUDE_DIRS}
+		${CUSP_HINT}
+	NAMES cusp/version.h
+	DOC "Cusp headers"
+)
+if(CUSP_INCLUDE_DIR)
+	list(REMOVE_DUPLICATES CUSP_INCLUDE_DIR)
+endif(CUSP_INCLUDE_DIR)
+
+# Find cusp version
+file(STRINGS ${CUSP_INCLUDE_DIR}/cusp/version.h
+	version
+	REGEX "#define CUSP_VERSION[ \t]+([0-9x]+)"
+)
+string(REGEX REPLACE
+	"#define CUSP_VERSION[ \t]+"
+	""
+	version
+	"${version}"
+)
+
+#define CUSP_MAJOR_VERSION     (CUSP_VERSION / 100000)
+#define CUSP_MINOR_VERSION     (CUSP_VERSION / 100 % 1000)
+#define CUSP_SUBMINOR_VERSION  (CUSP_VERSION % 100)
+
+math(EXPR CUSP_MAJOR_VERSION "${version} / 100000")
+math(EXPR CUSP_MINOR_VERSION "${version} / 100 % 1000")
+math(EXPR CUSP_PATCH_VERSION "${version} % 100")
+
+set(CUSP_VERSION "${CUSP_MAJOR_VERSION}.${CUSP_MINOR_VERSION}.${CUSP_PATCH_VERSION}")
+
+# Check for required components
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Cusp REQUIRED_VARS CUSP_INCLUDE_DIR VERSION_VAR CUSP_VERSION)
+
+set(CUSP_INCLUDE_DIRS ${CUSP_INCLUDE_DIR})
+mark_as_advanced(CUSP_INCLUDE_DIR)
--- a/resources/cmake/FindThrust.cmake
+++ b/resources/cmake/FindThrust.cmake
@ -1,53 +1,52 @@
 #
 # FindThrust
 #
-# This module finds the Thrust header files and extrats their version.  It
+# This module finds the Thrust header files and extracts their version.  It
 # sets the following variables.
 #
 # THRUST_INCLUDE_DIR -  Include directory for thrust header files.  (All header
 #                       files will actually be in the thrust subdirectory.)
 # THRUST_VERSION -      Version of thrust in the form "major.minor.patch".
 #
+# Thrust_FOUND - Indicates whether Thrust has been found
+#

-find_path( THRUST_INCLUDE_DIR
-  HINTS
-    /usr/include/cuda
-    /usr/local/include
-    /usr/local/cuda/include
-    ${CUDA_INCLUDE_DIRS}
-  NAMES thrust/version.h
-  DOC "Thrust headers"
-  )
-if( THRUST_INCLUDE_DIR )
-  list( REMOVE_DUPLICATES THRUST_INCLUDE_DIR )
-endif( THRUST_INCLUDE_DIR )
+find_path(THRUST_INCLUDE_DIR
+	HINTS
+		/usr/include/cuda
+		/usr/local/include
+		/usr/local/cuda/include
+		${CUDA_INCLUDE_DIRS}
+	NAMES thrust/version.h
+	DOC "Thrust headers"
+)
+if(THRUST_INCLUDE_DIR)
+	list(REMOVE_DUPLICATES THRUST_INCLUDE_DIR)
+endif(THRUST_INCLUDE_DIR)

 # Find thrust version
-file( STRINGS ${THRUST_INCLUDE_DIR}/thrust/version.h
-  version
-  REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)"
-  )
-string( REGEX REPLACE
-  "#define THRUST_VERSION[ \t]+"
-  ""
-  version
-  "${version}"
-  )
+file(STRINGS ${THRUST_INCLUDE_DIR}/thrust/version.h
+	version
+	REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)"
+)
+string(REGEX REPLACE
+	"#define THRUST_VERSION[ \t]+"
+	""
+	version
+	"${version}"
+)

-string( REGEX MATCH "^[0-9]" major ${version} )
-string( REGEX REPLACE "^${major}00" "" version "${version}" )
-string( REGEX MATCH "^[0-9]" minor ${version} )
-string( REGEX REPLACE "^${minor}0" "" version "${version}" )
-set( THRUST_VERSION "${major}.${minor}.${version}")
-set( THRUST_MAJOR_VERSION "${major}")
-set( THRUST_MINOR_VERSION "${minor}")
+string(REGEX MATCH "^[0-9]" major ${version})
+string(REGEX REPLACE "^${major}00" "" version "${version}")
+string(REGEX MATCH "^[0-9]" minor ${version})
+string(REGEX REPLACE "^${minor}0" "" version "${version}")
+set(THRUST_VERSION "${major}.${minor}.${version}")
+set(THRUST_MAJOR_VERSION "${major}")
+set(THRUST_MINOR_VERSION "${minor}")

 # Check for required components
-include( FindPackageHandleStandardArgs )
-find_package_handle_standard_args( Thrust
-  REQUIRED_VARS THRUST_INCLUDE_DIR
-  VERSION_VAR THRUST_VERSION
-  )
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Thrust REQUIRED_VARS THRUST_INCLUDE_DIR VERSION_VAR THRUST_VERSION)

 set(THRUST_INCLUDE_DIRS ${THRUST_INCLUDE_DIR})
 mark_as_advanced(THRUST_INCLUDE_DIR)
--- a/resources/cudaForStorm/CMakeAlignmentCheck.cpp
+++ b/resources/cudaForStorm/CMakeAlignmentCheck.cpp
@ -0,0 +1,64 @@
+/*
+ * This is component of StoRM - Cuda Plugin to check whether type alignment matches the assumptions done while optimizing the code.
+ */
+ #include <cstdint>
+ #include <utility>
+ #include <vector>
+ 
+ #define CONTAINER_SIZE 100ul
+ 
+ template <typename IndexType, typename ValueType>
+ int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
+	std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
+	for (size_t i = 0; i < containerSize; ++i) {
+		myVector->push_back(std::make_pair(firstValue, secondValue));
+	}
+	size_t myVectorSize = myVector->size();
+	IndexType* firstStart = &(myVector->at(0).first);
+	IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
+	ValueType* secondStart = &(myVector->at(0).second);
+	ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
+	size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
+	size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
+	size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
+	size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);
+	
+	delete myVector;
+	myVector = nullptr;
+	
+	if (myVectorSize != containerSize) {
+		return -2;
+	}
+	
+	// Check for alignment:
+	// Requirement is that the pairs are aligned like: first, second, first, second, first, second, ...
+	if (sizeof(IndexType) != sizeof(ValueType)) {
+		return -3;
+	}
+	if (startOffset != sizeof(IndexType)) {
+		return -4;
+	}
+	if (endOffset != sizeof(IndexType)) {
+		return -5;
+	}
+	if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
+		return -6;
+	}
+	if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
+		return -7;
+	}
+	
+	return 0;
+ }
+ 
+ 
+ int main(int argc, char* argv[]) {
+	int result = 0;
+	
+	result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
+	if (result != 0) {
+		return result;
+	}
+	
+	return 0;
+ }
--- a/resources/cudaForStorm/CMakeLists.txt
+++ b/resources/cudaForStorm/CMakeLists.txt
@ -35,9 +35,12 @@ set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE
 # Add the resources/cmake folder to Module Search Path for FindTBB.cmake
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/")

+# Set the hint for CUSP
+set(CUSP_HINT "${PROJECT_SOURCE_DIR}/../3rdparty/cusplibrary")
+
 find_package(CUDA REQUIRED)
+find_package(Cusp REQUIRED)
 find_package(Doxygen REQUIRED)
-find_package(Threads REQUIRED)
 find_package(Thrust REQUIRED)

 # If the DEBUG option was turned on, we will target a debug version and a release version otherwise
@ -115,10 +118,23 @@ endif()
 ##
 #############################################################

+# Test for type alignment
+try_run(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT
+	${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp"
+	COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR
+)
+if(NOT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT)
+	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}")
+elseif(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT EQUAL 0)
+	message(STATUS "StoRM (CudaPlugin) - Result of Type Alignment Check: OK.")
+else()
+	message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})")
+endif()
+  
 # Configure a header file to pass some of the CMake settings to the source code
 configure_file (
-	"${PROJECT_SOURCE_DIR}/../../storm-config.h.in"
-	"${PROJECT_BINARY_DIR}/include/storm-config.h"
+	"${PROJECT_SOURCE_DIR}/storm-cudaplugin-config.h.in"
+	"${PROJECT_BINARY_DIR}/include/storm-cudaplugin-config.h"
 )
 # Add the binary dir include directory for storm-config.h
 include_directories("${PROJECT_BINARY_DIR}/include")
@ -161,22 +177,6 @@ endif(ADDITIONAL_LINK_DIRS)
 #############################################################


-###############################################################################
-##                                                                            #
-##	Executable Creation                                                       #
-##                                                                            #
-##  All link_directories() calls MUST be made before this point               #
-##                                                                            #
-###############################################################################
-include (GenerateExportHeader)
-
-#add_library(cudaForStorm SHARED ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES})
-#GENERATE_EXPORT_HEADER( cudaForStorm
-#             BASE_NAME cudaForStorm
-#             EXPORT_MACRO_NAME cudaForStorm_EXPORT
-#             EXPORT_FILE_NAME include/cudaForStorm_Export.h
-#             STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
-#)

 #############################################################
 ##
@ -184,18 +184,6 @@ include (GenerateExportHeader)
 ##
 #############################################################
 #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30)
-cuda_add_library(cudaForStorm SHARED
-  ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}
-  OPTIONS -DSTUFF="" -arch=sm_30
-  RELEASE -DNDEBUG
-  DEBUG -g -DDEBUG
-)
-GENERATE_EXPORT_HEADER( cudaForStorm
-             BASE_NAME cudaForStorm
-             EXPORT_MACRO_NAME cudaForStorm_EXPORT
-             EXPORT_FILE_NAME include/cudaForStorm_Export.h
-             STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
-)
 #target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY})
 #ADD_DEPENDENCIES(cudaForStorm cudaLibrary)
 #target_link_libraries(cudaForStorm cudaLibrary)
@ -204,19 +192,52 @@ include_directories(${CUDA_INCLUDE_DIRS})

 #############################################################
 ##
-##	Threads
+##	CUSP
 ##
 #############################################################
-include_directories(${THREADS_INCLUDE_DIRS})
-target_link_libraries(cudaForStorm ${CMAKE_THREAD_LIBS_INIT})
+if(CUSP_FOUND)
+	include_directories(${CUSP_INCLUDE_DIR})
+	cuda_include_directories(${CUSP_INCLUDE_DIR})
+	message(STATUS "StoRM (CudaPlugin) - Found CUSP Version ${CUSP_VERSION} in location ${CUSP_INCLUDE_DIR}")
+else()
+	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find CUSP!")
+endif()

 #############################################################
 ##
 ##	Thrust
 ##
 #############################################################
-include_directories(${THRUST_INCLUDE_DIR})
-message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION}")
+if(THRUST_FOUND)
+	include_directories(${THRUST_INCLUDE_DIR})
+	cuda_include_directories(${THRUST_INCLUDE_DIR})
+	message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION} in location ${THRUST_INCLUDE_DIR}")
+else()
+	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find Thrust! Check your CUDA installation.")
+endif()
+
+###############################################################################
+##                                                                            #
+##	Executable Creation                                                       #
+##                                                                            #
+##  All link_directories() calls AND include_directories() calls              #
+##  MUST be made before this point                                            #
+##                                                                            #
+###############################################################################
+include (GenerateExportHeader)
+
+cuda_add_library(cudaForStorm SHARED
+  ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}
+  OPTIONS -DSTUFF="" -arch=sm_30
+  RELEASE -DNDEBUG
+  DEBUG -g -DDEBUG
+)
+GENERATE_EXPORT_HEADER( cudaForStorm
+             BASE_NAME cudaForStorm
+             EXPORT_MACRO_NAME cudaForStorm_EXPORT
+             EXPORT_FILE_NAME include/cudaForStorm_Export.h
+             STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
+)

 if (MSVC)
 	# Add the DebugHelper DLL
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
@ -6,61 +6,54 @@
 #include <cuda_runtime.h>
 #include "cusparse_v2.h"

+#include "cuspExtension.h"

 __global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) {
 	*B = *A;
 }

-void cudaForStormTestFunction(int a, int b) {
-	std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl;
-}
-
-void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
-	if (sizeof(double) != sizeof(uint_fast64_t)) {
-		std::cout << "FATAL ERROR - Internal Sizes of Double and uint_fast64_t do NOT match, CUDA acceleration not possible!" << std::endl;
-		return;
-	}
-	
-	uint_fast64_t* device_matrixRowIndices = nullptr;
-	uint_fast64_t* device_matrixColIndicesAndValues = nullptr;
-	double* device_x = nullptr;
-	double* device_b = nullptr;
-	double* device_multiplyResult = nullptr;
-	uint_fast64_t* device_nondeterministicChoiceIndices = nullptr;
+template <typename IndexType, typename ValueType>
+void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) {
+	IndexType* device_matrixRowIndices = nullptr;
+	IndexType* device_matrixColIndicesAndValues = nullptr;
+	ValueType* device_x = nullptr;
+	ValueType* device_b = nullptr;
+	ValueType* device_multiplyResult = nullptr;
+	IndexType* device_nondeterministicChoiceIndices = nullptr;

 	cudaError_t cudaMallocResult;

-	cudaMallocResult = cudaMalloc<uint_fast64_t>(&device_matrixRowIndices, matrixRowIndices.size());
+	cudaMallocResult = cudaMalloc<IndexType>(&device_matrixRowIndices, matrixRowIndices.size());
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}

-	cudaMallocResult = cudaMalloc<uint_fast64_t>(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2);
+	cudaMallocResult = cudaMalloc<IndexType>(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2);
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}

-	cudaMallocResult = cudaMalloc<double>(&device_x, x.size());
+	cudaMallocResult = cudaMalloc<ValueType>(&device_x, x.size());
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}

-	cudaMallocResult = cudaMalloc<double>(&device_b, b.size());
+	cudaMallocResult = cudaMalloc<ValueType>(&device_b, b.size());
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}

-	cudaMallocResult = cudaMalloc<double>(&device_multiplyResult, b.size());
+	cudaMallocResult = cudaMalloc<ValueType>(&device_multiplyResult, b.size());
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
 	}

-	cudaMallocResult = cudaMalloc<uint_fast64_t>(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size());
+	cudaMallocResult = cudaMalloc<IndexType>(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size());
 	if (cudaMallocResult != cudaSuccess) {
 		std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
 		goto cleanup;
@ -69,31 +62,31 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve
 	// Memory allocated, copy data to device
 	cudaError_t cudaCopyResult;

-	cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(uint_fast64_t) * matrixRowIndices.size(), cudaMemcpyHostToDevice);
+	cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * matrixRowIndices.size(), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}

-	cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(uint_fast64_t) * columnIndicesAndValues.size()) + (sizeof(double) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice);
+	cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * columnIndicesAndValues.size()) + (sizeof(ValueType) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}

-	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(double) * x.size(), cudaMemcpyHostToDevice);
+	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * x.size(), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}

-	cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(double) * b.size(), cudaMemcpyHostToDevice);
+	cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * b.size(), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
 	}

-	cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(uint_fast64_t) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice);
+	cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice);
 	if (cudaCopyResult != cudaSuccess) {
 		std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
 		goto cleanup;
@ -101,6 +94,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve

 	// Data is on device, start Kernel

+
 	// All code related to freeing memory and clearing up the device
 cleanup:
 	if (device_matrixRowIndices != nullptr) {
@ -148,35 +142,14 @@ cleanup:
 }

 /*
-void kernelSwitchTest(size_t N) {
-	int* deviceIntA;
-	int* deviceIntB;
+ * Declare and implement all exported functions for these Kernels here
+ *
+ */

-	if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) {
-		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
-		return;
-	}
-	if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) {
-		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
-		return;
-	}
-
-	// Allocate space on the device
-	auto start_time = std::chrono::high_resolution_clock::now();
-	for (int i = 0; i < N; ++i) {
-		cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB);
-	}
-	auto end_time = std::chrono::high_resolution_clock::now();
-	std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
-	std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl;
+void cudaForStormTestFunction(int a, int b) {
+	std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl;
+}

-	// Free memory on device
-	if (cudaFree(deviceIntA) != cudaSuccess) {
-		std::cout << "Error in cudaFree!" << std::endl;
-		return;
-	}
-	if (cudaFree(deviceIntB) != cudaSuccess) {
-		std::cout << "Error in cudaFree!" << std::endl;
-		return;
-	}
-}*/
+void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
+	basicValueIteration_mvReduce<uint_fast64_t, double>(maxIterationCount, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
+}
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.h
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h
@ -6,4 +6,4 @@
 #include "cudaForStorm_Export.h"

 cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b);
-cudaForStorm_EXPORT void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
+cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
--- a/resources/cudaForStorm/srcCuda/cuspExtension.h
+++ b/resources/cudaForStorm/srcCuda/cuspExtension.h
@ -0,0 +1,83 @@
+/*
+ * This is an extension of the original CUSP csr_vector.h SPMV implementation.
+ * It is based on the Code and incorporates changes as to cope with the details
+ * of the StoRM code.
+ * As this is mostly copy & past, the original license still applies.
+ */
+
+/*
+ *  Copyright 2008-2009 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+#pragma once
+
+#include <cusp/detail/device/spmv/csr_vector.h>
+
+namespace cusp
+{
+namespace detail
+{
+namespace device
+{
+
+template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
+void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+{
+    const size_t THREADS_PER_BLOCK  = 128;
+    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
+
+    const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
+    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
+    
+    if (UseCache)
+        bind_x(x);
+
+    spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> 
+        (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+
+    if (UseCache)
+        unbind_x(x);
+}
+
+template <typename IndexType, typename ValueType>
+void storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+{
+    const IndexType nnz_per_row = num_entries / num_rows;
+
+    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    
+    __storm_cuda_spmv_csr_vector<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+}
+
+template <typename IndexType, typename ValueType>
+void storm_cuda_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+{
+    const IndexType nnz_per_row = num_entries / num_rows;
+
+    if (nnz_per_row <=  2) { __storm_cuda_spmv_csr_vector<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <=  4) { __storm_cuda_spmv_csr_vector<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <=  8) { __storm_cuda_spmv_csr_vector<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+
+    __storm_cuda_spmv_csr_vector<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+}
+
+} // end namespace device
+} // end namespace detail
+} // end namespace cusp
--- a/resources/cudaForStorm/storm-cudaplugin-config.h.in
+++ b/resources/cudaForStorm/storm-cudaplugin-config.h.in
@ -0,0 +1,13 @@
+/*
+ * StoRM - Build-in Options
+ *
+ * This file is parsed by CMake during makefile generation
+ * It contains information such as the base path to the test/example data
+ */
+
+#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
+#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
+
+
+
+#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_