Added first signs of the CUDA Extension for Storm.

Former-commit-id: b02385cd82
13 years ago · d5828043de
12 changed files with 881 additions and 0 deletions
--- a/resources/cudaForStorm/CMakeLists.txt
+++ b/resources/cudaForStorm/CMakeLists.txt
@ -0,0 +1,334 @@
+cmake_minimum_required (VERSION 2.8.6)
+
+# Set project name
+project (cudaForStorm CXX C)
+
+# Set the version number
+set (STORM_CPP_VERSION_MAJOR 1)
+set (STORM_CPP_VERSION_MINOR 0)
+
+# Add base folder for better inclusion paths
+include_directories("${PROJECT_SOURCE_DIR}")
+include_directories("${PROJECT_SOURCE_DIR}/src")
+
+message(STATUS "CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}")
+
+#############################################################
+##
+##	CMake options of StoRM
+##
+#############################################################
+option(DEBUG "Sets whether the DEBUG mode is used" ON)
+option(USE_POPCNT "Sets whether the popcnt instruction is going to be used." ON)
+option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF)
+option(USE_LIBCXX "Sets whether the standard library is libc++." OFF)
+option(ENABLE_GLPK "Sets whether StoRM is built with support for glpk." OFF)
+set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).")
+set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).")
+set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.")
+set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.")
+
+#############################################################
+##
+##	Inclusion of required libraries
+##
+#############################################################
+
+# Add the resources/cmake folder to Module Search Path for FindTBB.cmake
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/")
+
+find_package(CUDA REQUIRED)
+find_package(Doxygen REQUIRED)
+find_package(Threads REQUIRED)
+
+# If the DEBUG option was turned on, we will target a debug version and a release version otherwise
+if (DEBUG)
+    set (CMAKE_BUILD_TYPE "DEBUG")
+else()
+    set (CMAKE_BUILD_TYPE "RELEASE")
+endif()
+message(STATUS "StoRM - Building ${CMAKE_BUILD_TYPE} version.")
+
+if ("${GUROBI_ROOT}" STREQUAL "")
+    set(ENABLE_GUROBI OFF)
+else()
+    set(ENABLE_GUROBI ON)
+endif()
+
+if ("${Z3_ROOT}" STREQUAL "")
+    set(ENABLE_Z3 OFF)
+else()
+    set(ENABLE_Z3 ON)
+	set(Z3_LIB_NAME "z3")
+endif()
+
+message(STATUS "StoRM - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
+message(STATUS "StoRM - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}")
+
+#############################################################
+##
+##	CUDA Options
+##
+#############################################################
+SET (CUDA_VERBOSE_BUILD ON CACHE BOOL "nvcc verbose" FORCE)
+set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
+set(BUILD_SHARED_LIBS OFF)
+set(CUDA_SEPARABLE_COMPILATION ON)
+#set(CUDA_NVCC_FLAGS "-arch=sm_30")
+
+#############################################################
+##
+##	Compiler specific settings and definitions
+##
+#############################################################
+
+# Path to the no-strict-aliasing target
+set(CONVERSIONHELPER_TARGET "${PROJECT_SOURCE_DIR}/src/utility/ConversionHelper.cpp")
+
+if(CMAKE_COMPILER_IS_GNUCC)
+    message(STATUS "StoRM - Using Compiler Configuration: GCC")
+    # Set standard flags for GCC
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -pedantic")
+    # -Werror is atm removed as this gave some problems with existing code
+    # May be re-set later
+    # (Thomas Heinemann, 2012-12-21)
+    
+    # Turn on popcnt instruction if desired (yes by default)
+    if (USE_POPCNT)
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt")
+    endif(USE_POPCNT)
+	
+	# Set the no-strict-aliasing target for GCC
+	set_source_files_properties(${CONVERSIONHELPER_TARGET} PROPERTIES COMPILE_FLAGS " -fno-strict-aliasing ")
+elseif(MSVC)
+    message(STATUS "StoRM - Using Compiler Configuration: MSVC")
+	# required for GMM to compile, ugly error directive in their code
+	add_definitions(/D_SCL_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS)
+	# required as the PRCTL Parser bloats object files (COFF) beyond their maximum size (see http://msdn.microsoft.com/en-us/library/8578y171(v=vs.110).aspx)
+	add_definitions(/bigobj)
+	# required by GTest and PrismGrammar::createIntegerVariable
+	add_definitions(/D_VARIADIC_MAX=10)
+	# Windows.h breaks GMM in gmm_except.h because of its macro definition for min and max
+	add_definitions(/DNOMINMAX)
+	
+	if(ENABLE_Z3)
+		set(Z3_LIB_NAME "libz3")
+	endif()
+	
+	# MSVC does not do strict-aliasing, so no option needed
+else(CLANG)
+    message(STATUS "StoRM - Using Compiler Configuration: Clang (LLVM)")
+	# As CLANG is not set as a variable, we need to set it in case we have not matched another compiler.
+	set (CLANG ON)
+    # Set standard flags for clang
+    set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops -O3")
+    if(UNIX AND NOT APPLE AND NOT USE_LIBCXX)
+		set(CLANG_STDLIB libstdc++)
+		message(STATUS "StoRM - Linking against libstdc++")
+    else()
+		set(CLANG_STDLIB libc++)
+		message(STATUS "StoRM - Linking against libc++")
+		# Disable Cotire
+		set(STORM_USE_COTIRE OFF)
+		# Set up some Xcode specific settings
+		set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11")
+		set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++")
+    endif()
+    
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=${CLANG_STDLIB} -Wall -pedantic -Wno-unused-variable -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -ftemplate-depth=1024")
+    
+    set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
+    
+    # Turn on popcnt instruction if desired (yes by default)
+    if (USE_POPCNT)
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt")
+    endif(USE_POPCNT)    
+	
+	# Set the no-strict-aliasing target for Clang
+	set_source_files_properties(${CONVERSIONHELPER_TARGET} PROPERTIES COMPILE_FLAGS " -fno-strict-aliasing ")
+endif()
+
+#############################################################
+##
+##	CMake-generated Config File for StoRM
+##
+#############################################################
+# Base path for test files
+set(STORM_CPP_TESTS_BASE_PATH "${PROJECT_SOURCE_DIR}/test")
+# Gurobi Defines
+if (ENABLE_GUROBI)
+	set(STORM_CPP_GUROBI_DEF "define")
+else()
+	set(STORM_CPP_GUROBI_DEF "undef")
+endif()
+
+# glpk defines
+if (ENABLE_GLPK)
+	set(STORM_CPP_GLPK_DEF "define")
+else()
+	set(STORM_CPP_GLPK_DEF "undef")
+endif()
+
+# Z3 Defines
+if (ENABLE_Z3)
+	set(STORM_CPP_Z3_DEF "define")
+else()
+	set(STORM_CPP_Z3_DEF "undef")
+endif()
+
+# Intel TBB Defines
+if (TBB_FOUND AND ENABLE_INTELTBB)
+	set(STORM_CPP_INTELTBB_DEF "define")
+else()
+	set(STORM_CPP_INTELTBB_DEF "undef")
+endif()
+
+# Configure a header file to pass some of the CMake settings to the source code
+configure_file (
+	"${PROJECT_SOURCE_DIR}/../../storm-config.h.in"
+	"${PROJECT_BINARY_DIR}/include/storm-config.h"
+)
+# Add the binary dir include directory for storm-config.h
+include_directories("${PROJECT_BINARY_DIR}/include")
+
+# Add the main source directory for includes
+include_directories("${PROJECT_SOURCE_DIR}/../../src")
+
+#############################################################
+##
+##	Source file aggregation and clustering
+##
+#############################################################
+file(GLOB_RECURSE CUDAFORSTORM_HEADERS ${PROJECT_SOURCE_DIR}/src/*.h)
+file(GLOB_RECURSE CUDAFORSTORM_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp)
+
+file(GLOB_RECURSE CUDAFORSTORM_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/srcCuda/*.cu")
+file(GLOB_RECURSE CUDAFORSTORM_CUDA_HEADERS "${PROJECT_SOURCE_DIR}/srcCuda/*.h")
+
+# Additional include files like the storm-config.h
+file(GLOB_RECURSE STORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h)
+
+# Group the headers and sources
+source_group(main FILES ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES})
+source_group(cuda FILES ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS})
+
+# Add custom additional include or link directories
+if (ADDITIONAL_INCLUDE_DIRS)
+	message(STATUS "StoRM - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}")
+	include_directories(${ADDITIONAL_INCLUDE_DIRS})
+endif(ADDITIONAL_INCLUDE_DIRS)
+if (ADDITIONAL_LINK_DIRS)
+	message(STATUS "StoRM - Using additional link directories ${ADDITIONAL_LINK_DIRS}")
+	link_directories(${ADDITIONAL_LINK_DIRS})
+endif(ADDITIONAL_LINK_DIRS)
+
+#############################################################
+##
+##	Pre executable-creation link_directories setup
+##
+#############################################################
+if (ENABLE_GUROBI)
+	link_directories("${GUROBI_ROOT}/lib")
+endif()
+if (ENABLE_Z3)
+    link_directories("${Z3_ROOT}/bin")
+endif()
+if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL ""))
+	set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib")
+endif ()
+link_directories(${Boost_LIBRARY_DIRS})
+if (TBB_FOUND AND ENABLE_INTELTBB)
+	link_directories(${TBB_LIBRARY_DIRS})
+endif()
+
+###############################################################################
+##                                                                            #
+##	Executable Creation                                                       #
+##                                                                            #
+##  All link_directories() calls MUST be made before this point               #
+##                                                                            #
+###############################################################################
+
+# Since this will be a library
+include (GenerateExportHeader)
+
+add_library(cudaForStorm STATIC ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES})
+GENERATE_EXPORT_HEADER( cudaForStorm
+             BASE_NAME cudaForStorm
+             EXPORT_MACRO_NAME cudaForStorm_EXPORT
+             EXPORT_FILE_NAME cudaForStorm_Export.h
+             STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
+)
+
+#############################################################
+##
+##	CUDA
+##
+#############################################################
+#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30)
+cuda_add_library(cudaLibrary
+  ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}
+  OPTIONS -DSTUFF="" -arch=sm_30
+  RELEASE -DNDEBUG
+  DEBUG -g -DDEBUG
+)
+target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY})
+ADD_DEPENDENCIES(cudaForStorm cudaLibrary)
+target_link_libraries(cudaForStorm cudaLibrary)
+message(STATUS "Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}")
+include_directories(${CUDA_INCLUDE_DIRS})
+
+#############################################################
+##
+##	Gurobi (optional)
+##
+#############################################################
+if (ENABLE_GUROBI)
+    message (STATUS "StoRM - Linking with Gurobi")
+	include_directories("${GUROBI_ROOT}/include")
+    target_link_libraries(cudaForStorm "gurobi56")
+endif(ENABLE_GUROBI)
+
+#############################################################
+##
+##	glpk (optional)
+##
+#############################################################
+if (ENABLE_GLPK)
+    message (STATUS "StoRM - Linking with glpk")
+    target_link_libraries(cudaForStorm "glpk")
+endif(ENABLE_GLPK)
+
+#############################################################
+##
+##	Z3 (optional)
+##
+#############################################################
+if (ENABLE_Z3)
+    message (STATUS "StoRM - Linking with Z3")
+	include_directories("${Z3_ROOT}/include")
+    target_link_libraries(cudaForStorm ${Z3_LIB_NAME})
+endif(ENABLE_Z3)
+
+#############################################################
+##
+##	Threads
+##
+#############################################################
+include_directories(${THREADS_INCLUDE_DIRS})
+target_link_libraries(cudaForStorm ${CMAKE_THREAD_LIBS_INIT})
+
+if (MSVC)
+	# Add the DebugHelper DLL
+	set(CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES} Dbghelp.lib")
+	target_link_libraries(cudaForStorm "Dbghelp.lib")   
+endif(MSVC)
+
+# Link against libc++abi if requested. May be needed to build on Linux systems using clang.
+if (LINK_LIBCXXABI)
+	message (STATUS "StoRM - Linking against libc++abi.")
+	target_link_libraries(storm "c++abi")
+	target_link_libraries(storm-functional-tests "c++abi")
+	target_link_libraries(storm-performance-tests "c++abi")
+endif(LINK_LIBCXXABI)
--- a/resources/cudaForStorm/src/cudaTests.h
+++ b/resources/cudaForStorm/src/cudaTests.h
@ -0,0 +1,124 @@
+#include <cuda.h>
+#include "srcCuda/allCudaKernels.h"
+
+#include <iostream>
+#include <chrono>
+#include <random>
+
+void cudaShowDevices() {
+	// Todo
+}
+
+void cudaSimpleAddTest(int a, int b) {
+	std::cout << "Running cudaSimpleAddTest:" << std::endl;
+	std::cout << "a = " << a << ", b = " << b << "" << std::endl;
+
+	int c = cuda_basicAdd(a, b);
+	
+	std::cout << "Result: " << c << "" << std::endl;
+}
+
+void cudaArrayFmaTest(int N) {
+	std::cout << "Running cudaArrayFmaTest:" << std::endl;
+	std::cout << "N is " << N << ", resulting in " << (5 * sizeof(int) * N) << " Bytes of Data." << std::endl;
+
+	std::cout << "Generating random input arrays." << std::endl;
+
+	std::default_random_engine generator;
+	std::uniform_int_distribution<int> distribution(0, INT32_MAX);
+	int dice_roll = distribution(generator);
+
+	auto start_time = std::chrono::high_resolution_clock::now();
+
+	int* arrayA = new int[N];
+	int* arrayB = new int[N];
+	int* arrayC = new int[N];
+	int* arrayD = new int[N];
+	int* arrayD_CPU = new int[N];
+
+	for (int i = 0; i < N; ++i) {
+		//arrayA[i] = distribution(generator);
+		//arrayB[i] = distribution(generator);
+		//arrayC[i] = distribution(generator);
+		arrayA[i] = i * 1000 + 137;
+		arrayB[i] = i * 7000 + 1537;
+		arrayC[i] = i * 15000 + 97;
+		arrayD[i] = 0;
+		arrayD_CPU[i] = 0;
+	}
+
+	auto end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "Array generation took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
+
+	std::cout << "Running FMA test on CPU." << std::endl;
+
+	start_time = std::chrono::high_resolution_clock::now();	
+	cuda_arrayFmaHelper(arrayA, arrayB, arrayC, arrayD_CPU, N);
+	end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "FMA on CPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
+
+	start_time = std::chrono::high_resolution_clock::now();
+	cuda_arrayFma(arrayA, arrayB, arrayC, arrayD, N);
+	end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "FMA on GPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
+
+	int errors = 0;
+	for (int i = 0; i < N; ++i) {
+		if (arrayD[i] != arrayD_CPU[i]) {
+			std::cout << "Error in Entry " << i << ": GPU has " << arrayD[i] << " but CPU has " << arrayD_CPU[i] << "!" << std::endl;
+			++errors;
+		}
+	}
+	std::cout << "Checked Arrays for Errors: " << errors << " Errors occured." << std::endl;
+}
+
+void cudaArrayFmaOptimizedTest(int N, int M) {
+	std::cout << "Running cudaArrayFmaTest:" << std::endl;
+	std::cout << "N is " << N << ", resulting in " << (4 * sizeof(int) * N) << " Bytes of Data." << std::endl;
+
+	size_t freeCudaMemory = getFreeCudaMemory();
+	size_t totalCudaMemory = getTotalCudaMemory();
+	int freeProzent = static_cast<int>(((double)freeCudaMemory)/((double)totalCudaMemory) * 100);
+
+	std::cout << "CUDA Device has " << freeCudaMemory << " Bytes of " << totalCudaMemory << " Bytes free (" << (freeProzent) << "%)." << std::endl;
+
+	std::cout << "Generating random input arrays." << std::endl;
+
+	std::default_random_engine generator;
+	std::uniform_int_distribution<int> distribution(0, INT32_MAX);
+
+	auto start_time = std::chrono::high_resolution_clock::now();
+
+	int* arrayA = new int[4 * N];
+	int* arrayA_CPU = new int[4 * N];
+
+	for (int i = 0; i < 4*N; ++i) {
+		arrayA[i] = i * 1000 + i + (357854878 % (i+1));
+		arrayA_CPU[i] = arrayA[i];
+	}
+
+	auto end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "Array generation took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
+
+	start_time = std::chrono::high_resolution_clock::now();	
+	cuda_arrayFmaOptimizedHelper(arrayA_CPU, N);
+	end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "FMA on CPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
+
+	start_time = std::chrono::high_resolution_clock::now();
+	cuda_arrayFmaOptimized(arrayA, N, M);
+	end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "FMA on GPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
+
+	int errors = 0;
+	for (int i = 0; i < N; i+=4) {
+		if (arrayA[i+3] != arrayA_CPU[i+3]) {
+			//std::cout << "Error in Entry " << i << ": GPU has " << arrayA[i+3] << " but CPU has " << arrayA_CPU[i+3] << "!" << std::endl;
+			++errors;
+		}
+	}
+	std::cout << "Checked Arrays for Errors: " << errors << " Errors occured." << std::endl;
+
+	delete[] arrayA;
+	delete[] arrayA_CPU;
+}
--- a/resources/cudaForStorm/src/main.cpp
+++ b/resources/cudaForStorm/src/main.cpp
@ -0,0 +1,62 @@
+#include <stdio.h>  
+#include <stdlib.h>
+
+#include <iostream>
+#include <chrono>
+#include <random>
+
+#include "cudaTests.h"
+
+int main(int argc, char **argv){
+	resetCudaDevice();
+
+	int testNumber = 0;
+	int N = 10000;
+	int M = 402653184;
+	if (argc > 1) {
+		testNumber = atoi(argv[1]);
+		if (argc > 2) {
+			N = atoi(argv[2]);
+			if (argc > 3) {
+				M = atoi(argv[3]);
+			}
+		}
+	}
+
+	switch (testNumber) {
+		case 1:
+			cudaSimpleAddTest(N, M);
+			break;
+		case 2:
+			cudaArrayFmaTest(N);
+			break;
+		case 3:
+			cudaArrayFmaOptimizedTest(N, M);
+			break;
+		case 4:
+			cpp_cuda_bandwidthTest(M, N);
+			break;
+		case 5:
+			kernelSwitchTest(N);
+			break;
+			break;
+		// DEFAULT AND 0
+		case 0:
+		default:
+			std::cout << "Available functions are:" << std::endl;
+			std::cout << "0 - Show this  overview" << std::endl;
+			std::cout << "1 - cuda   simpleAddTest(N, M)" << std::endl;
+			std::cout << "2 - cuda   arrayFmaTest(N)" << std::endl;
+			std::cout << "3 - cuda   arrayFmaOptimizedTest(N, M)" << std::endl;
+			std::cout << "4 - cuda   bandwidthTest(M, N)" << std::endl;
+			std::cout << "5 - cuda   kernelSwitchTest(N)" << std::endl;
+			std::cout << std::endl;
+			std::cout << "Call: " << argv[0] << " Selection [N [M]]" << std::endl;
+			std::cout << "Defaults:" <<std::endl;
+			std::cout << "N: 10000" << std::endl;
+			std::cout << "M: 402653184" << std::endl;
+			break;
+	}
+
+    return 0;
+}
--- a/resources/cudaForStorm/srcCuda/allCudaKernels.h
+++ b/resources/cudaForStorm/srcCuda/allCudaKernels.h
@ -0,0 +1,4 @@
+#include "utility.h"
+#include "bandWidth.h"
+#include "basicAdd.h"
+#include "kernelSwitchTest.h"
--- a/resources/cudaForStorm/srcCuda/bandWidth.cu
+++ b/resources/cudaForStorm/srcCuda/bandWidth.cu
--- a/resources/cudaForStorm/srcCuda/bandWidth.h
+++ b/resources/cudaForStorm/srcCuda/bandWidth.h
--- a/resources/cudaForStorm/srcCuda/basicAdd.cu
+++ b/resources/cudaForStorm/srcCuda/basicAdd.cu
@ -0,0 +1,286 @@
+#include <cuda.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <chrono>
+#include <iostream>
+
+__global__ void cuda_kernel_basicAdd(int a, int b, int *c) { 
+	*c = a + b; 
+}
+
+__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
+	// Fused Multiply Add:
+	// A * B + C => D
+
+	/*
+     *Die Variable i dient für den Zugriff auf das Array. Da jeder Thread die Funktion VecAdd
+     *ausführt, muss i für jeden Thread unterschiedlich sein. Ansonsten würden unterschiedliche
+     *Threads auf denselben Index im Array schreiben. blockDim.x ist die Anzahl der Threads der x-Komponente
+     *des Blocks, blockIdx.x ist die x-Koordinate des aktuellen Blocks und threadIdx.x ist die x-Koordinate des
+     *Threads, der die Funktion gerade ausführt.
+    */
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+	if (i < N) {
+		D[i] = A[i] * B[i] + C[i];
+	}
+}
+
+__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) {
+	// Fused Multiply Add:
+	// A * B + C => D
+
+	// Layout:
+	// A B C D A B C D A B C D
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+	if ((i*M) < N) {
+		for (int j = i*M; j < i*M + M; ++j) {
+			A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2];
+		}
+	}
+}
+
+extern "C" int cuda_basicAdd(int a, int b) {
+	int c = 0;
+	int *dev_c;
+	cudaMalloc((void**)&dev_c, sizeof(int));
+	cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c);
+	cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
+	//printf("%d + %d + 42 is %d\n", a, b, c);
+	cudaFree(dev_c);
+	return c;
+}
+
+void cpp_cuda_bandwidthTest(int entryCount, int N) {
+	// Size of the Arrays
+	size_t arraySize = entryCount * sizeof(int);
+	
+	int* deviceIntArray;
+	int* hostIntArray = new int[arraySize];
+
+	// Allocate space on the device
+	auto start_time = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < N; ++i) {
+		if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
+			std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl;
+			delete[] hostIntArray;
+			return;
+		}
+		// Free memory on device
+		if (cudaFree(deviceIntArray) != cudaSuccess) {
+			std::cout << "Error in cudaFree!" << std::endl;
+			delete[] hostIntArray;
+			return;
+		}
+	}
+	auto end_time = std::chrono::high_resolution_clock::now();
+	auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+	double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
+	std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
+	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second Allocationspeed." << std::endl;
+
+	if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
+		std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl;
+		delete[] hostIntArray;
+		return;
+	}
+	
+	// Prepare data
+	for (int i = 0; i < N; ++i) {
+		hostIntArray[i] = i * 333 + 123;
+	}
+
+	// Copy data TO device
+	start_time = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < N; ++i) {
+		if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+			std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl;
+			// Free memory on device
+			if (cudaFree(deviceIntArray) != cudaSuccess) {
+				std::cout << "Error in cudaFree!" << std::endl;
+			}
+			delete[] hostIntArray;
+			return;
+		}
+	}
+	end_time = std::chrono::high_resolution_clock::now();
+	copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+	mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
+	std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
+	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl;
+
+	// Copy data FROM device
+	start_time = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < N; ++i) {
+		if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
+			std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl;
+			// Free memory on device
+			if (cudaFree(deviceIntArray) != cudaSuccess) {
+				std::cout << "Error in cudaFree!" << std::endl;
+			}
+			delete[] hostIntArray;
+			return;
+		}
+	}
+	end_time = std::chrono::high_resolution_clock::now();
+	copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+	mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
+	std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
+	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl;
+
+	// Free memory on device
+	if (cudaFree(deviceIntArray) != cudaSuccess) {
+		std::cout << "Error in cudaFree!" << std::endl;
+	}
+	delete[] hostIntArray;
+}
+
+extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
+	// Size of the Arrays
+	size_t arraySize = N * sizeof(int);
+	
+	int* deviceIntArrayA;
+	int* deviceIntArrayB;
+	int* deviceIntArrayC;
+	int* deviceIntArrayD;
+
+	// Allocate space on the device
+	if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
+		printf("Error in cudaMalloc1!\n");
+		return;
+	}
+	if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) {
+		printf("Error in cudaMalloc2!\n");
+		cudaFree(deviceIntArrayA);
+		return;
+	}
+	if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) {
+		printf("Error in cudaMalloc3!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		return;
+	}
+	if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) {
+		printf("Error in cudaMalloc4!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		cudaFree(deviceIntArrayC);
+		return;
+	}
+	
+	// Copy data TO device
+	if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		cudaFree(deviceIntArrayC);
+		cudaFree(deviceIntArrayD);
+		return;
+	}
+	if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		cudaFree(deviceIntArrayC);
+		cudaFree(deviceIntArrayD);
+		return;
+	}
+	if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		cudaFree(deviceIntArrayC);
+		cudaFree(deviceIntArrayD);
+		return;
+	}
+	
+    // Festlegung der Threads pro Block
+    int threadsPerBlock = 512;
+    // Es werden soviele Blöcke benötigt, dass alle Elemente der Vektoren abgearbeitet werden können
+    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+	// Run kernel
+	cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N);
+
+	// Copy data FROM device
+	if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		cudaFree(deviceIntArrayB);
+		cudaFree(deviceIntArrayC);
+		cudaFree(deviceIntArrayD);
+		return;
+	}
+
+	// Free memory on device
+	cudaFree(deviceIntArrayA);
+	cudaFree(deviceIntArrayB);
+	cudaFree(deviceIntArrayC);
+	cudaFree(deviceIntArrayD);
+}
+
+extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) {
+	// Size of the Arrays
+	size_t arraySize = N * sizeof(int) * 4;
+	
+	int* deviceIntArrayA;
+
+	// Allocate space on the device
+	if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
+		printf("Error in cudaMalloc1!\n");
+		return;
+	}
+
+#define ONFAILFREE0() do { } while(0)
+#define ONFAILFREE1(a) do { cudaFree(a); } while(0)
+#define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0)
+#define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0)
+#define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0)
+#define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { int retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in func__!\n"); return; } } while(0)
+
+	// Copy data TO device
+
+	CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice);
+
+	/*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		return;
+	}*/
+	
+    // Festlegung der Threads pro Block
+    int threadsPerBlock = 512;
+    // Es werden soviele Blöcke benötigt, dass alle Elemente der Vektoren abgearbeitet werden können
+    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+	// Run kernel
+	cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M);
+
+	// Copy data FROM device
+	if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
+		printf("Error in cudaMemcpy!\n");
+		cudaFree(deviceIntArrayA);
+		return;
+	}
+
+	// Free memory on device
+	if (cudaFree(deviceIntArrayA) != cudaSuccess) {
+		printf("Error in cudaFree!\n");
+		return;
+	}
+}
+
+extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
+	for (int i = 0; i < N; ++i) {
+		D[i] = A[i] * B[i] + C[i];
+	}
+}
+
+extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) {
+	for (int i = 0; i < N; i += 4) {
+		A[i+3] = A[i] * A[i+1] + A[i+2];
+	}
+}
--- a/resources/cudaForStorm/srcCuda/basicAdd.h
+++ b/resources/cudaForStorm/srcCuda/basicAdd.h
@ -0,0 +1,9 @@
+extern "C" int cuda_basicAdd(int a, int b);
+
+extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M);
+extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N);
+
+extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N);
+extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N);
+
+void cpp_cuda_bandwidthTest(int entryCount, int N);
--- a/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
+++ b/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
@ -0,0 +1,39 @@
+#include <iostream>
+#include <chrono>
+
+__global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) {
+	*B = *A;
+}
+
+void kernelSwitchTest(size_t N) {
+	int* deviceIntA;
+	int* deviceIntB;
+
+	if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) {
+		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
+		return;
+	}
+	if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) {
+		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
+		return;
+	}
+
+	// Allocate space on the device
+	auto start_time = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < N; ++i) {
+		cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB);
+	}
+	auto end_time = std::chrono::high_resolution_clock::now();
+	std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
+	std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl;
+
+	// Free memory on device
+	if (cudaFree(deviceIntA) != cudaSuccess) {
+		std::cout << "Error in cudaFree!" << std::endl;
+		return;
+	}
+	if (cudaFree(deviceIntB) != cudaSuccess) {
+		std::cout << "Error in cudaFree!" << std::endl;
+		return;
+	}
+}
--- a/resources/cudaForStorm/srcCuda/kernelSwitchTest.h
+++ b/resources/cudaForStorm/srcCuda/kernelSwitchTest.h
@ -0,0 +1 @@
+void kernelSwitchTest(size_t N);
--- a/resources/cudaForStorm/srcCuda/utility.cu
+++ b/resources/cudaForStorm/srcCuda/utility.cu
@ -0,0 +1,19 @@
+size_t getFreeCudaMemory() {
+	size_t freeMemory;
+	size_t totalMemory;
+	cudaMemGetInfo(&freeMemory, &totalMemory);
+
+	return freeMemory;
+}
+
+size_t getTotalCudaMemory() {
+	size_t freeMemory;
+	size_t totalMemory;
+	cudaMemGetInfo(&freeMemory, &totalMemory);
+
+	return totalMemory;
+}
+
+void resetCudaDevice() {
+	cudaDeviceReset();
+}
--- a/resources/cudaForStorm/srcCuda/utility.h
+++ b/resources/cudaForStorm/srcCuda/utility.h
@ -0,0 +1,3 @@
+size_t getFreeCudaMemory();
+size_t getTotalCudaMemory();
+void resetCudaDevice();