12 changed files with 881 additions and 0 deletions
			
			
		- 
					334resources/cudaForStorm/CMakeLists.txt
- 
					124resources/cudaForStorm/src/cudaTests.h
- 
					62resources/cudaForStorm/src/main.cpp
- 
					4resources/cudaForStorm/srcCuda/allCudaKernels.h
- 
					0resources/cudaForStorm/srcCuda/bandWidth.cu
- 
					0resources/cudaForStorm/srcCuda/bandWidth.h
- 
					286resources/cudaForStorm/srcCuda/basicAdd.cu
- 
					9resources/cudaForStorm/srcCuda/basicAdd.h
- 
					39resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
- 
					1resources/cudaForStorm/srcCuda/kernelSwitchTest.h
- 
					19resources/cudaForStorm/srcCuda/utility.cu
- 
					3resources/cudaForStorm/srcCuda/utility.h
| @ -0,0 +1,334 @@ | |||||
|  | cmake_minimum_required (VERSION 2.8.6) | ||||
|  | 
 | ||||
|  | # Set project name | ||||
|  | project (cudaForStorm CXX C) | ||||
|  | 
 | ||||
|  | # Set the version number | ||||
|  | set (STORM_CPP_VERSION_MAJOR 1) | ||||
|  | set (STORM_CPP_VERSION_MINOR 0) | ||||
|  | 
 | ||||
|  | # Add base folder for better inclusion paths | ||||
|  | include_directories("${PROJECT_SOURCE_DIR}") | ||||
|  | include_directories("${PROJECT_SOURCE_DIR}/src") | ||||
|  | 
 | ||||
|  | message(STATUS "CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}") | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	CMake options of StoRM | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | option(DEBUG "Sets whether the DEBUG mode is used" ON) | ||||
|  | option(USE_POPCNT "Sets whether the popcnt instruction is going to be used." ON) | ||||
|  | option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF) | ||||
|  | option(USE_LIBCXX "Sets whether the standard library is libc++." OFF) | ||||
|  | option(ENABLE_GLPK "Sets whether StoRM is built with support for glpk." OFF) | ||||
|  | set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).") | ||||
|  | set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).") | ||||
|  | set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.") | ||||
|  | set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.") | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	Inclusion of required libraries | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | 
 | ||||
|  | # Add the resources/cmake folder to Module Search Path for FindTBB.cmake | ||||
|  | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/") | ||||
|  | 
 | ||||
|  | find_package(CUDA REQUIRED) | ||||
|  | find_package(Doxygen REQUIRED) | ||||
|  | find_package(Threads REQUIRED) | ||||
|  | 
 | ||||
|  | # If the DEBUG option was turned on, we will target a debug version and a release version otherwise | ||||
|  | if (DEBUG) | ||||
|  |     set (CMAKE_BUILD_TYPE "DEBUG") | ||||
|  | else() | ||||
|  |     set (CMAKE_BUILD_TYPE "RELEASE") | ||||
|  | endif() | ||||
|  | message(STATUS "StoRM - Building ${CMAKE_BUILD_TYPE} version.") | ||||
|  | 
 | ||||
|  | if ("${GUROBI_ROOT}" STREQUAL "") | ||||
|  |     set(ENABLE_GUROBI OFF) | ||||
|  | else() | ||||
|  |     set(ENABLE_GUROBI ON) | ||||
|  | endif() | ||||
|  | 
 | ||||
|  | if ("${Z3_ROOT}" STREQUAL "") | ||||
|  |     set(ENABLE_Z3 OFF) | ||||
|  | else() | ||||
|  |     set(ENABLE_Z3 ON) | ||||
|  | 	set(Z3_LIB_NAME "z3") | ||||
|  | endif() | ||||
|  | 
 | ||||
|  | message(STATUS "StoRM - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") | ||||
|  | message(STATUS "StoRM - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}") | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	CUDA Options | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | SET (CUDA_VERBOSE_BUILD ON CACHE BOOL "nvcc verbose" FORCE) | ||||
|  | set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON) | ||||
|  | set(BUILD_SHARED_LIBS OFF) | ||||
|  | set(CUDA_SEPARABLE_COMPILATION ON) | ||||
|  | #set(CUDA_NVCC_FLAGS "-arch=sm_30") | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	Compiler specific settings and definitions | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | 
 | ||||
|  | # Path to the no-strict-aliasing target | ||||
|  | set(CONVERSIONHELPER_TARGET "${PROJECT_SOURCE_DIR}/src/utility/ConversionHelper.cpp") | ||||
|  | 
 | ||||
|  | if(CMAKE_COMPILER_IS_GNUCC) | ||||
|  |     message(STATUS "StoRM - Using Compiler Configuration: GCC") | ||||
|  |     # Set standard flags for GCC | ||||
|  |     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops") | ||||
|  |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -pedantic") | ||||
|  |     # -Werror is atm removed as this gave some problems with existing code | ||||
|  |     # May be re-set later | ||||
|  |     # (Thomas Heinemann, 2012-12-21) | ||||
|  |      | ||||
|  |     # Turn on popcnt instruction if desired (yes by default) | ||||
|  |     if (USE_POPCNT) | ||||
|  |         set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt") | ||||
|  |     endif(USE_POPCNT) | ||||
|  | 	 | ||||
|  | 	# Set the no-strict-aliasing target for GCC | ||||
|  | 	set_source_files_properties(${CONVERSIONHELPER_TARGET} PROPERTIES COMPILE_FLAGS " -fno-strict-aliasing ") | ||||
|  | elseif(MSVC) | ||||
|  |     message(STATUS "StoRM - Using Compiler Configuration: MSVC") | ||||
|  | 	# required for GMM to compile, ugly error directive in their code | ||||
|  | 	add_definitions(/D_SCL_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS) | ||||
|  | 	# required as the PRCTL Parser bloats object files (COFF) beyond their maximum size (see http://msdn.microsoft.com/en-us/library/8578y171(v=vs.110).aspx) | ||||
|  | 	add_definitions(/bigobj) | ||||
|  | 	# required by GTest and PrismGrammar::createIntegerVariable | ||||
|  | 	add_definitions(/D_VARIADIC_MAX=10) | ||||
|  | 	# Windows.h breaks GMM in gmm_except.h because of its macro definition for min and max | ||||
|  | 	add_definitions(/DNOMINMAX) | ||||
|  | 	 | ||||
|  | 	if(ENABLE_Z3) | ||||
|  | 		set(Z3_LIB_NAME "libz3") | ||||
|  | 	endif() | ||||
|  | 	 | ||||
|  | 	# MSVC does not do strict-aliasing, so no option needed | ||||
|  | else(CLANG) | ||||
|  |     message(STATUS "StoRM - Using Compiler Configuration: Clang (LLVM)") | ||||
|  | 	# As CLANG is not set as a variable, we need to set it in case we have not matched another compiler. | ||||
|  | 	set (CLANG ON) | ||||
|  |     # Set standard flags for clang | ||||
|  |     set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops -O3") | ||||
|  |     if(UNIX AND NOT APPLE AND NOT USE_LIBCXX) | ||||
|  | 		set(CLANG_STDLIB libstdc++) | ||||
|  | 		message(STATUS "StoRM - Linking against libstdc++") | ||||
|  |     else() | ||||
|  | 		set(CLANG_STDLIB libc++) | ||||
|  | 		message(STATUS "StoRM - Linking against libc++") | ||||
|  | 		# Disable Cotire | ||||
|  | 		set(STORM_USE_COTIRE OFF) | ||||
|  | 		# Set up some Xcode specific settings | ||||
|  | 		set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11") | ||||
|  | 		set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") | ||||
|  |     endif() | ||||
|  |      | ||||
|  |     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=${CLANG_STDLIB} -Wall -pedantic -Wno-unused-variable -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -ftemplate-depth=1024") | ||||
|  |      | ||||
|  |     set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g") | ||||
|  |      | ||||
|  |     # Turn on popcnt instruction if desired (yes by default) | ||||
|  |     if (USE_POPCNT) | ||||
|  |         set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt") | ||||
|  |     endif(USE_POPCNT)     | ||||
|  | 	 | ||||
|  | 	# Set the no-strict-aliasing target for Clang | ||||
|  | 	set_source_files_properties(${CONVERSIONHELPER_TARGET} PROPERTIES COMPILE_FLAGS " -fno-strict-aliasing ") | ||||
|  | endif() | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	CMake-generated Config File for StoRM | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | # Base path for test files | ||||
|  | set(STORM_CPP_TESTS_BASE_PATH "${PROJECT_SOURCE_DIR}/test") | ||||
|  | # Gurobi Defines | ||||
|  | if (ENABLE_GUROBI) | ||||
|  | 	set(STORM_CPP_GUROBI_DEF "define") | ||||
|  | else() | ||||
|  | 	set(STORM_CPP_GUROBI_DEF "undef") | ||||
|  | endif() | ||||
|  | 
 | ||||
|  | # glpk defines | ||||
|  | if (ENABLE_GLPK) | ||||
|  | 	set(STORM_CPP_GLPK_DEF "define") | ||||
|  | else() | ||||
|  | 	set(STORM_CPP_GLPK_DEF "undef") | ||||
|  | endif() | ||||
|  | 
 | ||||
|  | # Z3 Defines | ||||
|  | if (ENABLE_Z3) | ||||
|  | 	set(STORM_CPP_Z3_DEF "define") | ||||
|  | else() | ||||
|  | 	set(STORM_CPP_Z3_DEF "undef") | ||||
|  | endif() | ||||
|  | 
 | ||||
|  | # Intel TBB Defines | ||||
|  | if (TBB_FOUND AND ENABLE_INTELTBB) | ||||
|  | 	set(STORM_CPP_INTELTBB_DEF "define") | ||||
|  | else() | ||||
|  | 	set(STORM_CPP_INTELTBB_DEF "undef") | ||||
|  | endif() | ||||
|  | 
 | ||||
|  | # Configure a header file to pass some of the CMake settings to the source code | ||||
|  | configure_file ( | ||||
|  | 	"${PROJECT_SOURCE_DIR}/../../storm-config.h.in" | ||||
|  | 	"${PROJECT_BINARY_DIR}/include/storm-config.h" | ||||
|  | ) | ||||
|  | # Add the binary dir include directory for storm-config.h | ||||
|  | include_directories("${PROJECT_BINARY_DIR}/include") | ||||
|  | 
 | ||||
|  | # Add the main source directory for includes | ||||
|  | include_directories("${PROJECT_SOURCE_DIR}/../../src") | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	Source file aggregation and clustering | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | file(GLOB_RECURSE CUDAFORSTORM_HEADERS ${PROJECT_SOURCE_DIR}/src/*.h) | ||||
|  | file(GLOB_RECURSE CUDAFORSTORM_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp) | ||||
|  | 
 | ||||
|  | file(GLOB_RECURSE CUDAFORSTORM_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/srcCuda/*.cu") | ||||
|  | file(GLOB_RECURSE CUDAFORSTORM_CUDA_HEADERS "${PROJECT_SOURCE_DIR}/srcCuda/*.h") | ||||
|  | 
 | ||||
|  | # Additional include files like the storm-config.h | ||||
|  | file(GLOB_RECURSE STORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h) | ||||
|  | 
 | ||||
|  | # Group the headers and sources | ||||
|  | source_group(main FILES ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) | ||||
|  | source_group(cuda FILES ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}) | ||||
|  | 
 | ||||
|  | # Add custom additional include or link directories | ||||
|  | if (ADDITIONAL_INCLUDE_DIRS) | ||||
|  | 	message(STATUS "StoRM - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}") | ||||
|  | 	include_directories(${ADDITIONAL_INCLUDE_DIRS}) | ||||
|  | endif(ADDITIONAL_INCLUDE_DIRS) | ||||
|  | if (ADDITIONAL_LINK_DIRS) | ||||
|  | 	message(STATUS "StoRM - Using additional link directories ${ADDITIONAL_LINK_DIRS}") | ||||
|  | 	link_directories(${ADDITIONAL_LINK_DIRS}) | ||||
|  | endif(ADDITIONAL_LINK_DIRS) | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	Pre executable-creation link_directories setup | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | if (ENABLE_GUROBI) | ||||
|  | 	link_directories("${GUROBI_ROOT}/lib") | ||||
|  | endif() | ||||
|  | if (ENABLE_Z3) | ||||
|  |     link_directories("${Z3_ROOT}/bin") | ||||
|  | endif() | ||||
|  | if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL "")) | ||||
|  | 	set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib") | ||||
|  | endif () | ||||
|  | link_directories(${Boost_LIBRARY_DIRS}) | ||||
|  | if (TBB_FOUND AND ENABLE_INTELTBB) | ||||
|  | 	link_directories(${TBB_LIBRARY_DIRS}) | ||||
|  | endif() | ||||
|  | 
 | ||||
|  | ############################################################################### | ||||
|  | ##                                                                            # | ||||
|  | ##	Executable Creation                                                       # | ||||
|  | ##                                                                            # | ||||
|  | ##  All link_directories() calls MUST be made before this point               # | ||||
|  | ##                                                                            # | ||||
|  | ############################################################################### | ||||
|  | 
 | ||||
|  | # Since this will be a library | ||||
|  | include (GenerateExportHeader) | ||||
|  | 
 | ||||
|  | add_library(cudaForStorm STATIC ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) | ||||
|  | GENERATE_EXPORT_HEADER( cudaForStorm | ||||
|  |              BASE_NAME cudaForStorm | ||||
|  |              EXPORT_MACRO_NAME cudaForStorm_EXPORT | ||||
|  |              EXPORT_FILE_NAME cudaForStorm_Export.h | ||||
|  |              STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC | ||||
|  | ) | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	CUDA | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30) | ||||
|  | cuda_add_library(cudaLibrary | ||||
|  |   ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS} | ||||
|  |   OPTIONS -DSTUFF="" -arch=sm_30 | ||||
|  |   RELEASE -DNDEBUG | ||||
|  |   DEBUG -g -DDEBUG | ||||
|  | ) | ||||
|  | target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY}) | ||||
|  | ADD_DEPENDENCIES(cudaForStorm cudaLibrary) | ||||
|  | target_link_libraries(cudaForStorm cudaLibrary) | ||||
|  | message(STATUS "Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}") | ||||
|  | include_directories(${CUDA_INCLUDE_DIRS}) | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	Gurobi (optional) | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | if (ENABLE_GUROBI) | ||||
|  |     message (STATUS "StoRM - Linking with Gurobi") | ||||
|  | 	include_directories("${GUROBI_ROOT}/include") | ||||
|  |     target_link_libraries(cudaForStorm "gurobi56") | ||||
|  | endif(ENABLE_GUROBI) | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	glpk (optional) | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | if (ENABLE_GLPK) | ||||
|  |     message (STATUS "StoRM - Linking with glpk") | ||||
|  |     target_link_libraries(cudaForStorm "glpk") | ||||
|  | endif(ENABLE_GLPK) | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	Z3 (optional) | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | if (ENABLE_Z3) | ||||
|  |     message (STATUS "StoRM - Linking with Z3") | ||||
|  | 	include_directories("${Z3_ROOT}/include") | ||||
|  |     target_link_libraries(cudaForStorm ${Z3_LIB_NAME}) | ||||
|  | endif(ENABLE_Z3) | ||||
|  | 
 | ||||
|  | ############################################################# | ||||
|  | ## | ||||
|  | ##	Threads | ||||
|  | ## | ||||
|  | ############################################################# | ||||
|  | include_directories(${THREADS_INCLUDE_DIRS}) | ||||
|  | target_link_libraries(cudaForStorm ${CMAKE_THREAD_LIBS_INIT}) | ||||
|  | 
 | ||||
|  | if (MSVC) | ||||
|  | 	# Add the DebugHelper DLL | ||||
|  | 	set(CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES} Dbghelp.lib") | ||||
|  | 	target_link_libraries(cudaForStorm "Dbghelp.lib")    | ||||
|  | endif(MSVC) | ||||
|  | 
 | ||||
|  | # Link against libc++abi if requested. May be needed to build on Linux systems using clang. | ||||
|  | if (LINK_LIBCXXABI) | ||||
|  | 	message (STATUS "StoRM - Linking against libc++abi.") | ||||
|  | 	target_link_libraries(storm "c++abi") | ||||
|  | 	target_link_libraries(storm-functional-tests "c++abi") | ||||
|  | 	target_link_libraries(storm-performance-tests "c++abi") | ||||
|  | endif(LINK_LIBCXXABI) | ||||
| @ -0,0 +1,124 @@ | |||||
|  | #include <cuda.h> | ||||
|  | #include "srcCuda/allCudaKernels.h" | ||||
|  | 
 | ||||
|  | #include <iostream> | ||||
|  | #include <chrono> | ||||
|  | #include <random> | ||||
|  | 
 | ||||
|  | void cudaShowDevices() { | ||||
|  | 	// Todo | ||||
|  | } | ||||
|  | 
 | ||||
|  | void cudaSimpleAddTest(int a, int b) { | ||||
|  | 	std::cout << "Running cudaSimpleAddTest:" << std::endl; | ||||
|  | 	std::cout << "a = " << a << ", b = " << b << "" << std::endl; | ||||
|  | 
 | ||||
|  | 	int c = cuda_basicAdd(a, b); | ||||
|  | 	 | ||||
|  | 	std::cout << "Result: " << c << "" << std::endl; | ||||
|  | } | ||||
|  | 
 | ||||
|  | void cudaArrayFmaTest(int N) { | ||||
|  | 	std::cout << "Running cudaArrayFmaTest:" << std::endl; | ||||
|  | 	std::cout << "N is " << N << ", resulting in " << (5 * sizeof(int) * N) << " Bytes of Data." << std::endl; | ||||
|  | 
 | ||||
|  | 	std::cout << "Generating random input arrays." << std::endl; | ||||
|  | 
 | ||||
|  | 	std::default_random_engine generator; | ||||
|  | 	std::uniform_int_distribution<int> distribution(0, INT32_MAX); | ||||
|  | 	int dice_roll = distribution(generator); | ||||
|  | 
 | ||||
|  | 	auto start_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 
 | ||||
|  | 	int* arrayA = new int[N]; | ||||
|  | 	int* arrayB = new int[N]; | ||||
|  | 	int* arrayC = new int[N]; | ||||
|  | 	int* arrayD = new int[N]; | ||||
|  | 	int* arrayD_CPU = new int[N]; | ||||
|  | 
 | ||||
|  | 	for (int i = 0; i < N; ++i) { | ||||
|  | 		//arrayA[i] = distribution(generator); | ||||
|  | 		//arrayB[i] = distribution(generator); | ||||
|  | 		//arrayC[i] = distribution(generator); | ||||
|  | 		arrayA[i] = i * 1000 + 137; | ||||
|  | 		arrayB[i] = i * 7000 + 1537; | ||||
|  | 		arrayC[i] = i * 15000 + 97; | ||||
|  | 		arrayD[i] = 0; | ||||
|  | 		arrayD_CPU[i] = 0; | ||||
|  | 	} | ||||
|  | 
 | ||||
|  | 	auto end_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	std::cout << "Array generation took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl; | ||||
|  | 
 | ||||
|  | 	std::cout << "Running FMA test on CPU." << std::endl; | ||||
|  | 
 | ||||
|  | 	start_time = std::chrono::high_resolution_clock::now();	 | ||||
|  | 	cuda_arrayFmaHelper(arrayA, arrayB, arrayC, arrayD_CPU, N); | ||||
|  | 	end_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	std::cout << "FMA on CPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl; | ||||
|  | 
 | ||||
|  | 	start_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	cuda_arrayFma(arrayA, arrayB, arrayC, arrayD, N); | ||||
|  | 	end_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	std::cout << "FMA on GPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl; | ||||
|  | 
 | ||||
|  | 	int errors = 0; | ||||
|  | 	for (int i = 0; i < N; ++i) { | ||||
|  | 		if (arrayD[i] != arrayD_CPU[i]) { | ||||
|  | 			std::cout << "Error in Entry " << i << ": GPU has " << arrayD[i] << " but CPU has " << arrayD_CPU[i] << "!" << std::endl; | ||||
|  | 			++errors; | ||||
|  | 		} | ||||
|  | 	} | ||||
|  | 	std::cout << "Checked Arrays for Errors: " << errors << " Errors occured." << std::endl; | ||||
|  | } | ||||
|  | 
 | ||||
|  | void cudaArrayFmaOptimizedTest(int N, int M) { | ||||
|  | 	std::cout << "Running cudaArrayFmaTest:" << std::endl; | ||||
|  | 	std::cout << "N is " << N << ", resulting in " << (4 * sizeof(int) * N) << " Bytes of Data." << std::endl; | ||||
|  | 
 | ||||
|  | 	size_t freeCudaMemory = getFreeCudaMemory(); | ||||
|  | 	size_t totalCudaMemory = getTotalCudaMemory(); | ||||
|  | 	int freeProzent = static_cast<int>(((double)freeCudaMemory)/((double)totalCudaMemory) * 100); | ||||
|  | 
 | ||||
|  | 	std::cout << "CUDA Device has " << freeCudaMemory << " Bytes of " << totalCudaMemory << " Bytes free (" << (freeProzent) << "%)." << std::endl; | ||||
|  | 
 | ||||
|  | 	std::cout << "Generating random input arrays." << std::endl; | ||||
|  | 
 | ||||
|  | 	std::default_random_engine generator; | ||||
|  | 	std::uniform_int_distribution<int> distribution(0, INT32_MAX); | ||||
|  | 
 | ||||
|  | 	auto start_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 
 | ||||
|  | 	int* arrayA = new int[4 * N]; | ||||
|  | 	int* arrayA_CPU = new int[4 * N]; | ||||
|  | 
 | ||||
|  | 	for (int i = 0; i < 4*N; ++i) { | ||||
|  | 		arrayA[i] = i * 1000 + i + (357854878 % (i+1)); | ||||
|  | 		arrayA_CPU[i] = arrayA[i]; | ||||
|  | 	} | ||||
|  | 
 | ||||
|  | 	auto end_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	std::cout << "Array generation took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl; | ||||
|  | 
 | ||||
|  | 	start_time = std::chrono::high_resolution_clock::now();	 | ||||
|  | 	cuda_arrayFmaOptimizedHelper(arrayA_CPU, N); | ||||
|  | 	end_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	std::cout << "FMA on CPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl; | ||||
|  | 
 | ||||
|  | 	start_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	cuda_arrayFmaOptimized(arrayA, N, M); | ||||
|  | 	end_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	std::cout << "FMA on GPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl; | ||||
|  | 
 | ||||
|  | 	int errors = 0; | ||||
|  | 	for (int i = 0; i < N; i+=4) { | ||||
|  | 		if (arrayA[i+3] != arrayA_CPU[i+3]) { | ||||
|  | 			//std::cout << "Error in Entry " << i << ": GPU has " << arrayA[i+3] << " but CPU has " << arrayA_CPU[i+3] << "!" << std::endl; | ||||
|  | 			++errors; | ||||
|  | 		} | ||||
|  | 	} | ||||
|  | 	std::cout << "Checked Arrays for Errors: " << errors << " Errors occured." << std::endl; | ||||
|  | 
 | ||||
|  | 	delete[] arrayA; | ||||
|  | 	delete[] arrayA_CPU; | ||||
|  | } | ||||
| @ -0,0 +1,62 @@ | |||||
|  | #include <stdio.h>  
 | ||||
|  | #include <stdlib.h>
 | ||||
|  | 
 | ||||
|  | #include <iostream>
 | ||||
|  | #include <chrono>
 | ||||
|  | #include <random>
 | ||||
|  | 
 | ||||
|  | #include "cudaTests.h"
 | ||||
|  | 
 | ||||
|  | int main(int argc, char **argv){ | ||||
|  | 	resetCudaDevice(); | ||||
|  | 
 | ||||
|  | 	int testNumber = 0; | ||||
|  | 	int N = 10000; | ||||
|  | 	int M = 402653184; | ||||
|  | 	if (argc > 1) { | ||||
|  | 		testNumber = atoi(argv[1]); | ||||
|  | 		if (argc > 2) { | ||||
|  | 			N = atoi(argv[2]); | ||||
|  | 			if (argc > 3) { | ||||
|  | 				M = atoi(argv[3]); | ||||
|  | 			} | ||||
|  | 		} | ||||
|  | 	} | ||||
|  | 
 | ||||
|  | 	switch (testNumber) { | ||||
|  | 		case 1: | ||||
|  | 			cudaSimpleAddTest(N, M); | ||||
|  | 			break; | ||||
|  | 		case 2: | ||||
|  | 			cudaArrayFmaTest(N); | ||||
|  | 			break; | ||||
|  | 		case 3: | ||||
|  | 			cudaArrayFmaOptimizedTest(N, M); | ||||
|  | 			break; | ||||
|  | 		case 4: | ||||
|  | 			cpp_cuda_bandwidthTest(M, N); | ||||
|  | 			break; | ||||
|  | 		case 5: | ||||
|  | 			kernelSwitchTest(N); | ||||
|  | 			break; | ||||
|  | 			break; | ||||
|  | 		// DEFAULT AND 0
 | ||||
|  | 		case 0: | ||||
|  | 		default: | ||||
|  | 			std::cout << "Available functions are:" << std::endl; | ||||
|  | 			std::cout << "0 - Show this  overview" << std::endl; | ||||
|  | 			std::cout << "1 - cuda   simpleAddTest(N, M)" << std::endl; | ||||
|  | 			std::cout << "2 - cuda   arrayFmaTest(N)" << std::endl; | ||||
|  | 			std::cout << "3 - cuda   arrayFmaOptimizedTest(N, M)" << std::endl; | ||||
|  | 			std::cout << "4 - cuda   bandwidthTest(M, N)" << std::endl; | ||||
|  | 			std::cout << "5 - cuda   kernelSwitchTest(N)" << std::endl; | ||||
|  | 			std::cout << std::endl; | ||||
|  | 			std::cout << "Call: " << argv[0] << " Selection [N [M]]" << std::endl; | ||||
|  | 			std::cout << "Defaults:" <<std::endl; | ||||
|  | 			std::cout << "N: 10000" << std::endl; | ||||
|  | 			std::cout << "M: 402653184" << std::endl; | ||||
|  | 			break; | ||||
|  | 	} | ||||
|  | 
 | ||||
|  |     return 0; | ||||
|  | } | ||||
| @ -0,0 +1,4 @@ | |||||
|  | #include "utility.h" | ||||
|  | #include "bandWidth.h" | ||||
|  | #include "basicAdd.h" | ||||
|  | #include "kernelSwitchTest.h" | ||||
| @ -0,0 +1,286 @@ | |||||
|  | #include <cuda.h> | ||||
|  | #include <stdlib.h> | ||||
|  | #include <stdio.h> | ||||
|  | 
 | ||||
|  | #include <chrono> | ||||
|  | #include <iostream> | ||||
|  | 
 | ||||
|  | __global__ void cuda_kernel_basicAdd(int a, int b, int *c) {  | ||||
|  | 	*c = a + b;  | ||||
|  | } | ||||
|  | 
 | ||||
|  | __global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) { | ||||
|  | 	// Fused Multiply Add: | ||||
|  | 	// A * B + C => D | ||||
|  | 
 | ||||
|  | 	/* | ||||
|  |      *Die Variable i dient für den Zugriff auf das Array. Da jeder Thread die Funktion VecAdd | ||||
|  |      *ausführt, muss i für jeden Thread unterschiedlich sein. Ansonsten würden unterschiedliche | ||||
|  |      *Threads auf denselben Index im Array schreiben. blockDim.x ist die Anzahl der Threads der x-Komponente | ||||
|  |      *des Blocks, blockIdx.x ist die x-Koordinate des aktuellen Blocks und threadIdx.x ist die x-Koordinate des | ||||
|  |      *Threads, der die Funktion gerade ausführt. | ||||
|  |     */ | ||||
|  |     int i = blockDim.x * blockIdx.x + threadIdx.x; | ||||
|  | 
 | ||||
|  | 	if (i < N) { | ||||
|  | 		D[i] = A[i] * B[i] + C[i]; | ||||
|  | 	} | ||||
|  | } | ||||
|  | 
 | ||||
|  | __global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) { | ||||
|  | 	// Fused Multiply Add: | ||||
|  | 	// A * B + C => D | ||||
|  | 
 | ||||
|  | 	// Layout: | ||||
|  | 	// A B C D A B C D A B C D | ||||
|  | 
 | ||||
|  |     int i = blockDim.x * blockIdx.x + threadIdx.x; | ||||
|  | 
 | ||||
|  | 	if ((i*M) < N) { | ||||
|  | 		for (int j = i*M; j < i*M + M; ++j) { | ||||
|  | 			A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2]; | ||||
|  | 		} | ||||
|  | 	} | ||||
|  | } | ||||
|  | 
 | ||||
|  | extern "C" int cuda_basicAdd(int a, int b) { | ||||
|  | 	int c = 0; | ||||
|  | 	int *dev_c; | ||||
|  | 	cudaMalloc((void**)&dev_c, sizeof(int)); | ||||
|  | 	cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c); | ||||
|  | 	cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost); | ||||
|  | 	//printf("%d + %d + 42 is %d\n", a, b, c); | ||||
|  | 	cudaFree(dev_c); | ||||
|  | 	return c; | ||||
|  | } | ||||
|  | 
 | ||||
|  | void cpp_cuda_bandwidthTest(int entryCount, int N) { | ||||
|  | 	// Size of the Arrays | ||||
|  | 	size_t arraySize = entryCount * sizeof(int); | ||||
|  | 	 | ||||
|  | 	int* deviceIntArray; | ||||
|  | 	int* hostIntArray = new int[arraySize]; | ||||
|  | 
 | ||||
|  | 	// Allocate space on the device | ||||
|  | 	auto start_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	for (int i = 0; i < N; ++i) { | ||||
|  | 		if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) { | ||||
|  | 			std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl; | ||||
|  | 			delete[] hostIntArray; | ||||
|  | 			return; | ||||
|  | 		} | ||||
|  | 		// Free memory on device | ||||
|  | 		if (cudaFree(deviceIntArray) != cudaSuccess) { | ||||
|  | 			std::cout << "Error in cudaFree!" << std::endl; | ||||
|  | 			delete[] hostIntArray; | ||||
|  | 			return; | ||||
|  | 		} | ||||
|  | 	} | ||||
|  | 	auto end_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count(); | ||||
|  | 	double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625; | ||||
|  | 	std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl; | ||||
|  | 	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second Allocationspeed." << std::endl; | ||||
|  | 
 | ||||
|  | 	if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) { | ||||
|  | 		std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl; | ||||
|  | 		delete[] hostIntArray; | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 	 | ||||
|  | 	// Prepare data | ||||
|  | 	for (int i = 0; i < N; ++i) { | ||||
|  | 		hostIntArray[i] = i * 333 + 123; | ||||
|  | 	} | ||||
|  | 
 | ||||
|  | 	// Copy data TO device | ||||
|  | 	start_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	for (int i = 0; i < N; ++i) { | ||||
|  | 		if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { | ||||
|  | 			std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl; | ||||
|  | 			// Free memory on device | ||||
|  | 			if (cudaFree(deviceIntArray) != cudaSuccess) { | ||||
|  | 				std::cout << "Error in cudaFree!" << std::endl; | ||||
|  | 			} | ||||
|  | 			delete[] hostIntArray; | ||||
|  | 			return; | ||||
|  | 		} | ||||
|  | 	} | ||||
|  | 	end_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count(); | ||||
|  | 	mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625; | ||||
|  | 	std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl; | ||||
|  | 	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl; | ||||
|  | 
 | ||||
|  | 	// Copy data FROM device | ||||
|  | 	start_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	for (int i = 0; i < N; ++i) { | ||||
|  | 		if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) { | ||||
|  | 			std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl; | ||||
|  | 			// Free memory on device | ||||
|  | 			if (cudaFree(deviceIntArray) != cudaSuccess) { | ||||
|  | 				std::cout << "Error in cudaFree!" << std::endl; | ||||
|  | 			} | ||||
|  | 			delete[] hostIntArray; | ||||
|  | 			return; | ||||
|  | 		} | ||||
|  | 	} | ||||
|  | 	end_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count(); | ||||
|  | 	mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625; | ||||
|  | 	std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl; | ||||
|  | 	std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl; | ||||
|  | 
 | ||||
|  | 	// Free memory on device | ||||
|  | 	if (cudaFree(deviceIntArray) != cudaSuccess) { | ||||
|  | 		std::cout << "Error in cudaFree!" << std::endl; | ||||
|  | 	} | ||||
|  | 	delete[] hostIntArray; | ||||
|  | } | ||||
|  | 
 | ||||
|  | extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) { | ||||
|  | 	// Size of the Arrays | ||||
|  | 	size_t arraySize = N * sizeof(int); | ||||
|  | 	 | ||||
|  | 	int* deviceIntArrayA; | ||||
|  | 	int* deviceIntArrayB; | ||||
|  | 	int* deviceIntArrayC; | ||||
|  | 	int* deviceIntArrayD; | ||||
|  | 
 | ||||
|  | 	// Allocate space on the device | ||||
|  | 	if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMalloc1!\n"); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 	if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMalloc2!\n"); | ||||
|  | 		cudaFree(deviceIntArrayA); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 	if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMalloc3!\n"); | ||||
|  | 		cudaFree(deviceIntArrayA); | ||||
|  | 		cudaFree(deviceIntArrayB); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 	if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMalloc4!\n"); | ||||
|  | 		cudaFree(deviceIntArrayA); | ||||
|  | 		cudaFree(deviceIntArrayB); | ||||
|  | 		cudaFree(deviceIntArrayC); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 	 | ||||
|  | 	// Copy data TO device | ||||
|  | 	if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMemcpy!\n"); | ||||
|  | 		cudaFree(deviceIntArrayA); | ||||
|  | 		cudaFree(deviceIntArrayB); | ||||
|  | 		cudaFree(deviceIntArrayC); | ||||
|  | 		cudaFree(deviceIntArrayD); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 	if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMemcpy!\n"); | ||||
|  | 		cudaFree(deviceIntArrayA); | ||||
|  | 		cudaFree(deviceIntArrayB); | ||||
|  | 		cudaFree(deviceIntArrayC); | ||||
|  | 		cudaFree(deviceIntArrayD); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 	if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMemcpy!\n"); | ||||
|  | 		cudaFree(deviceIntArrayA); | ||||
|  | 		cudaFree(deviceIntArrayB); | ||||
|  | 		cudaFree(deviceIntArrayC); | ||||
|  | 		cudaFree(deviceIntArrayD); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 	 | ||||
|  |     // Festlegung der Threads pro Block | ||||
|  |     int threadsPerBlock = 512; | ||||
|  |     // Es werden soviele Blöcke benötigt, dass alle Elemente der Vektoren abgearbeitet werden können | ||||
|  |     int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; | ||||
|  | 
 | ||||
|  | 	// Run kernel | ||||
|  | 	cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N); | ||||
|  | 
 | ||||
|  | 	// Copy data FROM device | ||||
|  | 	if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMemcpy!\n"); | ||||
|  | 		cudaFree(deviceIntArrayA); | ||||
|  | 		cudaFree(deviceIntArrayB); | ||||
|  | 		cudaFree(deviceIntArrayC); | ||||
|  | 		cudaFree(deviceIntArrayD); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 
 | ||||
|  | 	// Free memory on device | ||||
|  | 	cudaFree(deviceIntArrayA); | ||||
|  | 	cudaFree(deviceIntArrayB); | ||||
|  | 	cudaFree(deviceIntArrayC); | ||||
|  | 	cudaFree(deviceIntArrayD); | ||||
|  | } | ||||
|  | 
 | ||||
|  | extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) { | ||||
|  | 	// Size of the Arrays | ||||
|  | 	size_t arraySize = N * sizeof(int) * 4; | ||||
|  | 	 | ||||
|  | 	int* deviceIntArrayA; | ||||
|  | 
 | ||||
|  | 	// Allocate space on the device | ||||
|  | 	if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMalloc1!\n"); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 
 | ||||
|  | #define ONFAILFREE0() do { } while(0) | ||||
|  | #define ONFAILFREE1(a) do { cudaFree(a); } while(0) | ||||
|  | #define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0) | ||||
|  | #define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0) | ||||
|  | #define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0) | ||||
|  | #define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { int retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in func__!\n"); return; } } while(0) | ||||
|  | 
 | ||||
|  | 	// Copy data TO device | ||||
|  | 
 | ||||
|  | 	CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice); | ||||
|  | 
 | ||||
|  | 	/*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMemcpy!\n"); | ||||
|  | 		cudaFree(deviceIntArrayA); | ||||
|  | 		return; | ||||
|  | 	}*/ | ||||
|  | 	 | ||||
|  |     // Festlegung der Threads pro Block | ||||
|  |     int threadsPerBlock = 512; | ||||
|  |     // Es werden soviele Blöcke benötigt, dass alle Elemente der Vektoren abgearbeitet werden können | ||||
|  |     int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; | ||||
|  | 
 | ||||
|  | 	// Run kernel | ||||
|  | 	cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M); | ||||
|  | 
 | ||||
|  | 	// Copy data FROM device | ||||
|  | 	if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaMemcpy!\n"); | ||||
|  | 		cudaFree(deviceIntArrayA); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 
 | ||||
|  | 	// Free memory on device | ||||
|  | 	if (cudaFree(deviceIntArrayA) != cudaSuccess) { | ||||
|  | 		printf("Error in cudaFree!\n"); | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | } | ||||
|  | 
 | ||||
|  | extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) { | ||||
|  | 	for (int i = 0; i < N; ++i) { | ||||
|  | 		D[i] = A[i] * B[i] + C[i]; | ||||
|  | 	} | ||||
|  | } | ||||
|  | 
 | ||||
|  | extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) { | ||||
|  | 	for (int i = 0; i < N; i += 4) { | ||||
|  | 		A[i+3] = A[i] * A[i+1] + A[i+2]; | ||||
|  | 	} | ||||
|  | } | ||||
| @ -0,0 +1,9 @@ | |||||
|  | extern "C" int cuda_basicAdd(int a, int b); | ||||
|  | 
 | ||||
|  | extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M); | ||||
|  | extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N); | ||||
|  | 
 | ||||
|  | extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N); | ||||
|  | extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N); | ||||
|  | 
 | ||||
|  | void cpp_cuda_bandwidthTest(int entryCount, int N); | ||||
| @ -0,0 +1,39 @@ | |||||
|  | #include <iostream> | ||||
|  | #include <chrono> | ||||
|  | 
 | ||||
|  | __global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) { | ||||
|  | 	*B = *A; | ||||
|  | } | ||||
|  | 
 | ||||
|  | void kernelSwitchTest(size_t N) { | ||||
|  | 	int* deviceIntA; | ||||
|  | 	int* deviceIntB; | ||||
|  | 
 | ||||
|  | 	if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) { | ||||
|  | 		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 	if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) { | ||||
|  | 		std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 
 | ||||
|  | 	// Allocate space on the device | ||||
|  | 	auto start_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	for (int i = 0; i < N; ++i) { | ||||
|  | 		cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB); | ||||
|  | 	} | ||||
|  | 	auto end_time = std::chrono::high_resolution_clock::now(); | ||||
|  | 	std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl; | ||||
|  | 	std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl; | ||||
|  | 
 | ||||
|  | 	// Free memory on device | ||||
|  | 	if (cudaFree(deviceIntA) != cudaSuccess) { | ||||
|  | 		std::cout << "Error in cudaFree!" << std::endl; | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | 	if (cudaFree(deviceIntB) != cudaSuccess) { | ||||
|  | 		std::cout << "Error in cudaFree!" << std::endl; | ||||
|  | 		return; | ||||
|  | 	} | ||||
|  | } | ||||
| @ -0,0 +1 @@ | |||||
|  | void kernelSwitchTest(size_t N); | ||||
| @ -0,0 +1,19 @@ | |||||
|  | size_t getFreeCudaMemory() { | ||||
|  | 	size_t freeMemory; | ||||
|  | 	size_t totalMemory; | ||||
|  | 	cudaMemGetInfo(&freeMemory, &totalMemory); | ||||
|  | 
 | ||||
|  | 	return freeMemory; | ||||
|  | } | ||||
|  | 
 | ||||
|  | size_t getTotalCudaMemory() { | ||||
|  | 	size_t freeMemory; | ||||
|  | 	size_t totalMemory; | ||||
|  | 	cudaMemGetInfo(&freeMemory, &totalMemory); | ||||
|  | 
 | ||||
|  | 	return totalMemory; | ||||
|  | } | ||||
|  | 
 | ||||
|  | void resetCudaDevice() { | ||||
|  | 	cudaDeviceReset(); | ||||
|  | } | ||||
| @ -0,0 +1,3 @@ | |||||
|  | size_t getFreeCudaMemory(); | ||||
|  | size_t getTotalCudaMemory(); | ||||
|  | void resetCudaDevice(); | ||||
						Write
						Preview
					
					
					Loading…
					
					Cancel
						Save
					
		Reference in new issue