Added first signs of the CUDA Extension for Storm.
Former-commit-id: b02385cd82
tempestpy_adaptions
PBerger
11 years ago
12 changed files with 881 additions and 0 deletions
+ 334  resources/cudaForStorm/CMakeLists.txt
+ 124  resources/cudaForStorm/src/cudaTests.h
+  62  resources/cudaForStorm/src/main.cpp
+   4  resources/cudaForStorm/srcCuda/allCudaKernels.h
+   0  resources/cudaForStorm/srcCuda/bandWidth.cu
+   0  resources/cudaForStorm/srcCuda/bandWidth.h
+ 286  resources/cudaForStorm/srcCuda/basicAdd.cu
+   9  resources/cudaForStorm/srcCuda/basicAdd.h
+  39  resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
+   1  resources/cudaForStorm/srcCuda/kernelSwitchTest.h
+  19  resources/cudaForStorm/srcCuda/utility.cu
+   3  resources/cudaForStorm/srcCuda/utility.h
resources/cudaForStorm/CMakeLists.txt
@@ -0,0 +1,334 @@
cmake_minimum_required (VERSION 2.8.6)

# Set project name
project (cudaForStorm CXX C)

# Set the version number
set (STORM_CPP_VERSION_MAJOR 1)
set (STORM_CPP_VERSION_MINOR 0)

# Add base folder for better inclusion paths
include_directories("${PROJECT_SOURCE_DIR}")
include_directories("${PROJECT_SOURCE_DIR}/src")

message(STATUS "CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}")

#############################################################
##
##  CMake options of StoRM
##
#############################################################
option(DEBUG "Sets whether the DEBUG mode is used" ON)
option(USE_POPCNT "Sets whether the popcnt instruction is going to be used." ON)
option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF)
option(USE_LIBCXX "Sets whether the standard library is libc++." OFF)
option(ENABLE_GLPK "Sets whether StoRM is built with support for glpk." OFF)
set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).")
set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).")
set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.")
set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.")

#############################################################
##
##  Inclusion of required libraries
##
#############################################################

# Add the resources/cmake folder to the module search path for FindTBB.cmake
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/")

find_package(CUDA REQUIRED)
find_package(Doxygen REQUIRED)
find_package(Threads REQUIRED)

# If the DEBUG option was turned on, we target a debug version, and a release version otherwise
if (DEBUG)
    set (CMAKE_BUILD_TYPE "DEBUG")
else()
    set (CMAKE_BUILD_TYPE "RELEASE")
endif()
message(STATUS "StoRM - Building ${CMAKE_BUILD_TYPE} version.")

if ("${GUROBI_ROOT}" STREQUAL "")
    set(ENABLE_GUROBI OFF)
else()
    set(ENABLE_GUROBI ON)
endif()

if ("${Z3_ROOT}" STREQUAL "")
    set(ENABLE_Z3 OFF)
else()
    set(ENABLE_Z3 ON)
    set(Z3_LIB_NAME "z3")
endif()

message(STATUS "StoRM - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
message(STATUS "StoRM - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}")

#############################################################
##
##  CUDA Options
##
#############################################################
SET (CUDA_VERBOSE_BUILD ON CACHE BOOL "nvcc verbose" FORCE)
set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
set(BUILD_SHARED_LIBS OFF)
set(CUDA_SEPARABLE_COMPILATION ON)
#set(CUDA_NVCC_FLAGS "-arch=sm_30")

#############################################################
##
##  Compiler specific settings and definitions
##
#############################################################

# Path to the no-strict-aliasing target
set(CONVERSIONHELPER_TARGET "${PROJECT_SOURCE_DIR}/src/utility/ConversionHelper.cpp")

if(CMAKE_COMPILER_IS_GNUCC)
    message(STATUS "StoRM - Using Compiler Configuration: GCC")
    # Set standard flags for GCC
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -pedantic")
    # -Werror is removed for now, as it caused problems with existing code;
    # it may be re-enabled later.
    # (Thomas Heinemann, 2012-12-21)

    # Turn on the popcnt instruction if desired (on by default)
    if (USE_POPCNT)
        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt")
    endif(USE_POPCNT)

    # Set the no-strict-aliasing target for GCC
    set_source_files_properties(${CONVERSIONHELPER_TARGET} PROPERTIES COMPILE_FLAGS " -fno-strict-aliasing ")
elseif(MSVC)
    message(STATUS "StoRM - Using Compiler Configuration: MSVC")
    # Required for GMM to compile, due to an ugly error directive in their code
    add_definitions(/D_SCL_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS)
    # Required as the PRCTL parser bloats object files (COFF) beyond their maximum size (see http://msdn.microsoft.com/en-us/library/8578y171(v=vs.110).aspx)
    add_definitions(/bigobj)
    # Required by GTest and PrismGrammar::createIntegerVariable
    add_definitions(/D_VARIADIC_MAX=10)
    # Windows.h breaks GMM in gmm_except.h because of its macro definitions for min and max
    add_definitions(/DNOMINMAX)

    if(ENABLE_Z3)
        set(Z3_LIB_NAME "libz3")
    endif()

    # MSVC does not do strict aliasing, so no option is needed
else(CLANG)
    message(STATUS "StoRM - Using Compiler Configuration: Clang (LLVM)")
    # As CLANG is not set as a variable, we need to set it in case we have not matched another compiler.
    set (CLANG ON)
    # Set standard flags for clang
    set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops -O3")
    if(UNIX AND NOT APPLE AND NOT USE_LIBCXX)
        set(CLANG_STDLIB libstdc++)
        message(STATUS "StoRM - Linking against libstdc++")
    else()
        set(CLANG_STDLIB libc++)
        message(STATUS "StoRM - Linking against libc++")
        # Disable Cotire
        set(STORM_USE_COTIRE OFF)
        # Set up some Xcode specific settings
        set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11")
        set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++")
    endif()

    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=${CLANG_STDLIB} -Wall -pedantic -Wno-unused-variable -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -ftemplate-depth=1024")

    set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")

    # Turn on the popcnt instruction if desired (on by default)
    if (USE_POPCNT)
        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt")
    endif(USE_POPCNT)

    # Set the no-strict-aliasing target for Clang
    set_source_files_properties(${CONVERSIONHELPER_TARGET} PROPERTIES COMPILE_FLAGS " -fno-strict-aliasing ")
endif()

#############################################################
##
##  CMake-generated Config File for StoRM
##
#############################################################
# Base path for test files
set(STORM_CPP_TESTS_BASE_PATH "${PROJECT_SOURCE_DIR}/test")
# Gurobi defines
if (ENABLE_GUROBI)
    set(STORM_CPP_GUROBI_DEF "define")
else()
    set(STORM_CPP_GUROBI_DEF "undef")
endif()

# glpk defines
if (ENABLE_GLPK)
    set(STORM_CPP_GLPK_DEF "define")
else()
    set(STORM_CPP_GLPK_DEF "undef")
endif()

# Z3 defines
if (ENABLE_Z3)
    set(STORM_CPP_Z3_DEF "define")
else()
    set(STORM_CPP_Z3_DEF "undef")
endif()

# Intel TBB defines
if (TBB_FOUND AND ENABLE_INTELTBB)
    set(STORM_CPP_INTELTBB_DEF "define")
else()
    set(STORM_CPP_INTELTBB_DEF "undef")
endif()

# Configure a header file to pass some of the CMake settings to the source code
configure_file (
    "${PROJECT_SOURCE_DIR}/../../storm-config.h.in"
    "${PROJECT_BINARY_DIR}/include/storm-config.h"
)
# Add the binary dir include directory for storm-config.h
include_directories("${PROJECT_BINARY_DIR}/include")

# Add the main source directory for includes
include_directories("${PROJECT_SOURCE_DIR}/../../src")

#############################################################
##
##  Source file aggregation and clustering
##
#############################################################
file(GLOB_RECURSE CUDAFORSTORM_HEADERS ${PROJECT_SOURCE_DIR}/src/*.h)
file(GLOB_RECURSE CUDAFORSTORM_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp)

file(GLOB_RECURSE CUDAFORSTORM_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/srcCuda/*.cu")
file(GLOB_RECURSE CUDAFORSTORM_CUDA_HEADERS "${PROJECT_SOURCE_DIR}/srcCuda/*.h")

# Additional include files like the storm-config.h
file(GLOB_RECURSE STORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h)

# Group the headers and sources
source_group(main FILES ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES})
source_group(cuda FILES ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS})

# Add custom additional include or link directories
if (ADDITIONAL_INCLUDE_DIRS)
    message(STATUS "StoRM - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}")
    include_directories(${ADDITIONAL_INCLUDE_DIRS})
endif(ADDITIONAL_INCLUDE_DIRS)
if (ADDITIONAL_LINK_DIRS)
    message(STATUS "StoRM - Using additional link directories ${ADDITIONAL_LINK_DIRS}")
    link_directories(${ADDITIONAL_LINK_DIRS})
endif(ADDITIONAL_LINK_DIRS)

#############################################################
##
##  Pre executable-creation link_directories setup
##
#############################################################
if (ENABLE_GUROBI)
    link_directories("${GUROBI_ROOT}/lib")
endif()
if (ENABLE_Z3)
    link_directories("${Z3_ROOT}/bin")
endif()
if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL ""))
    set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib")
endif ()
link_directories(${Boost_LIBRARY_DIRS})
if (TBB_FOUND AND ENABLE_INTELTBB)
    link_directories(${TBB_LIBRARY_DIRS})
endif()

###############################################################################
##                                                                            #
##  Executable Creation                                                       #
##                                                                            #
##  All link_directories() calls MUST be made before this point               #
##                                                                            #
###############################################################################

# Since this will be a library
include (GenerateExportHeader)

add_library(cudaForStorm STATIC ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES})
GENERATE_EXPORT_HEADER( cudaForStorm
    BASE_NAME cudaForStorm
    EXPORT_MACRO_NAME cudaForStorm_EXPORT
    EXPORT_FILE_NAME cudaForStorm_Export.h
    STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
)

#############################################################
##
##  CUDA
##
#############################################################
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30)
cuda_add_library(cudaLibrary
    ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}
    OPTIONS -DSTUFF="" -arch=sm_30
    RELEASE -DNDEBUG
    DEBUG -g -DDEBUG
)
target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY})
ADD_DEPENDENCIES(cudaForStorm cudaLibrary)
target_link_libraries(cudaForStorm cudaLibrary)
message(STATUS "Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}")
include_directories(${CUDA_INCLUDE_DIRS})

#############################################################
##
##  Gurobi (optional)
##
#############################################################
if (ENABLE_GUROBI)
    message (STATUS "StoRM - Linking with Gurobi")
    include_directories("${GUROBI_ROOT}/include")
    target_link_libraries(cudaForStorm "gurobi56")
endif(ENABLE_GUROBI)

#############################################################
##
##  glpk (optional)
##
#############################################################
if (ENABLE_GLPK)
    message (STATUS "StoRM - Linking with glpk")
    target_link_libraries(cudaForStorm "glpk")
endif(ENABLE_GLPK)

#############################################################
##
##  Z3 (optional)
##
#############################################################
if (ENABLE_Z3)
    message (STATUS "StoRM - Linking with Z3")
    include_directories("${Z3_ROOT}/include")
    target_link_libraries(cudaForStorm ${Z3_LIB_NAME})
endif(ENABLE_Z3)

#############################################################
##
##  Threads
##
#############################################################
include_directories(${THREADS_INCLUDE_DIRS})
target_link_libraries(cudaForStorm ${CMAKE_THREAD_LIBS_INIT})

if (MSVC)
    # Add the DebugHelper DLL
    set(CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES} Dbghelp.lib")
    target_link_libraries(cudaForStorm "Dbghelp.lib")
endif(MSVC)

# Link against libc++abi if requested. May be needed to build on Linux systems using clang.
if (LINK_LIBCXXABI)
    message (STATUS "StoRM - Linking against libc++abi.")
    target_link_libraries(storm "c++abi")
    target_link_libraries(storm-functional-tests "c++abi")
    target_link_libraries(storm-performance-tests "c++abi")
endif(LINK_LIBCXXABI)
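
A note on the GENERATE_EXPORT_HEADER call above: it writes cudaForStorm_Export.h into the build tree and defines the cudaForStorm_EXPORT macro in it. A minimal sketch of how a public header of this library could consume that macro; the function name exampleQuery is a placeholder for illustration, not something this commit adds:

// Hypothetical public header for the cudaForStorm library (illustration only).
// cudaForStorm_Export.h is generated at configure time. When
// cudaForStorm_BUILT_AS_STATIC is defined (the STATIC_DEFINE above),
// cudaForStorm_EXPORT expands to nothing; in a shared build it expands to the
// platform's export/import attribute.
#include "cudaForStorm_Export.h"

cudaForStorm_EXPORT int exampleQuery(int input);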
resources/cudaForStorm/src/cudaTests.h
@@ -0,0 +1,124 @@
#include <cuda.h>
#include "srcCuda/allCudaKernels.h"

#include <iostream>
#include <chrono>
#include <random>
#include <cstdint> // for INT32_MAX

void cudaShowDevices() {
    // Todo
}

void cudaSimpleAddTest(int a, int b) {
    std::cout << "Running cudaSimpleAddTest:" << std::endl;
    std::cout << "a = " << a << ", b = " << b << std::endl;

    int c = cuda_basicAdd(a, b);

    std::cout << "Result: " << c << std::endl;
}

void cudaArrayFmaTest(int N) {
    std::cout << "Running cudaArrayFmaTest:" << std::endl;
    std::cout << "N is " << N << ", resulting in " << (5 * sizeof(int) * N) << " Bytes of Data." << std::endl;

    std::cout << "Generating random input arrays." << std::endl;

    std::default_random_engine generator;
    std::uniform_int_distribution<int> distribution(0, INT32_MAX);
    int dice_roll = distribution(generator);

    auto start_time = std::chrono::high_resolution_clock::now();

    int* arrayA = new int[N];
    int* arrayB = new int[N];
    int* arrayC = new int[N];
    int* arrayD = new int[N];
    int* arrayD_CPU = new int[N];

    for (int i = 0; i < N; ++i) {
        //arrayA[i] = distribution(generator);
        //arrayB[i] = distribution(generator);
        //arrayC[i] = distribution(generator);
        arrayA[i] = i * 1000 + 137;
        arrayB[i] = i * 7000 + 1537;
        arrayC[i] = i * 15000 + 97;
        arrayD[i] = 0;
        arrayD_CPU[i] = 0;
    }

    auto end_time = std::chrono::high_resolution_clock::now();
    std::cout << "Array generation took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " micros" << std::endl;

    std::cout << "Running FMA test on CPU." << std::endl;

    start_time = std::chrono::high_resolution_clock::now();
    cuda_arrayFmaHelper(arrayA, arrayB, arrayC, arrayD_CPU, N);
    end_time = std::chrono::high_resolution_clock::now();
    std::cout << "FMA on CPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " micros" << std::endl;

    start_time = std::chrono::high_resolution_clock::now();
    cuda_arrayFma(arrayA, arrayB, arrayC, arrayD, N);
    end_time = std::chrono::high_resolution_clock::now();
    std::cout << "FMA on GPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " micros" << std::endl;

    int errors = 0;
    for (int i = 0; i < N; ++i) {
        if (arrayD[i] != arrayD_CPU[i]) {
            std::cout << "Error in Entry " << i << ": GPU has " << arrayD[i] << " but CPU has " << arrayD_CPU[i] << "!" << std::endl;
            ++errors;
        }
    }
    std::cout << "Checked Arrays for Errors: " << errors << " Errors occurred." << std::endl;

    // Free the host arrays
    delete[] arrayA;
    delete[] arrayB;
    delete[] arrayC;
    delete[] arrayD;
    delete[] arrayD_CPU;
}

void cudaArrayFmaOptimizedTest(int N, int M) {
    std::cout << "Running cudaArrayFmaOptimizedTest:" << std::endl;
    std::cout << "N is " << N << ", resulting in " << (4 * sizeof(int) * N) << " Bytes of Data." << std::endl;

    size_t freeCudaMemory = getFreeCudaMemory();
    size_t totalCudaMemory = getTotalCudaMemory();
    int freePercent = static_cast<int>(((double)freeCudaMemory) / ((double)totalCudaMemory) * 100);

    std::cout << "CUDA Device has " << freeCudaMemory << " Bytes of " << totalCudaMemory << " Bytes free (" << freePercent << "%)." << std::endl;

    std::cout << "Generating random input arrays." << std::endl;

    std::default_random_engine generator;
    std::uniform_int_distribution<int> distribution(0, INT32_MAX);

    auto start_time = std::chrono::high_resolution_clock::now();

    int* arrayA = new int[4 * N];
    int* arrayA_CPU = new int[4 * N];

    for (int i = 0; i < 4 * N; ++i) {
        arrayA[i] = i * 1000 + i + (357854878 % (i + 1));
        arrayA_CPU[i] = arrayA[i];
    }

    auto end_time = std::chrono::high_resolution_clock::now();
    std::cout << "Array generation took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " micros" << std::endl;

    start_time = std::chrono::high_resolution_clock::now();
    cuda_arrayFmaOptimizedHelper(arrayA_CPU, N);
    end_time = std::chrono::high_resolution_clock::now();
    std::cout << "FMA on CPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " micros" << std::endl;

    start_time = std::chrono::high_resolution_clock::now();
    cuda_arrayFmaOptimized(arrayA, N, M);
    end_time = std::chrono::high_resolution_clock::now();
    std::cout << "FMA on GPU took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " micros" << std::endl;

    int errors = 0;
    for (int i = 0; i < N; i += 4) {
        if (arrayA[i + 3] != arrayA_CPU[i + 3]) {
            //std::cout << "Error in Entry " << i << ": GPU has " << arrayA[i+3] << " but CPU has " << arrayA_CPU[i+3] << "!" << std::endl;
            ++errors;
        }
    }
    std::cout << "Checked Arrays for Errors: " << errors << " Errors occurred." << std::endl;

    delete[] arrayA;
    delete[] arrayA_CPU;
}
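
Every test in cudaTests.h repeats the same measurement pattern: take a high_resolution_clock timestamp, run the work, take a second timestamp, and duration_cast the difference to microseconds. A small helper could factor this out; this is only a sketch, and timeMicros is a hypothetical name, not part of this commit:

#include <chrono>

// Hypothetical helper: runs any callable and returns the elapsed wall-clock
// time in microseconds, mirroring the measurement pattern used in cudaTests.h.
template<typename F>
long long timeMicros(F&& f) {
    auto start = std::chrono::high_resolution_clock::now();
    f();
    auto end = std::chrono::high_resolution_clock::now();
    return std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
}

// Usage sketch:
//   std::cout << timeMicros([&]{ cuda_arrayFma(arrayA, arrayB, arrayC, arrayD, N); }) << " micros" << std::endl;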
resources/cudaForStorm/src/main.cpp
@@ -0,0 +1,62 @@
#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <chrono>
#include <random>

#include "cudaTests.h"

int main(int argc, char **argv) {
    resetCudaDevice();

    int testNumber = 0;
    int N = 10000;
    int M = 402653184;
    if (argc > 1) {
        testNumber = atoi(argv[1]);
        if (argc > 2) {
            N = atoi(argv[2]);
            if (argc > 3) {
                M = atoi(argv[3]);
            }
        }
    }

    switch (testNumber) {
        case 1:
            cudaSimpleAddTest(N, M);
            break;
        case 2:
            cudaArrayFmaTest(N);
            break;
        case 3:
            cudaArrayFmaOptimizedTest(N, M);
            break;
        case 4:
            cpp_cuda_bandwidthTest(M, N);
            break;
        case 5:
            kernelSwitchTest(N);
            break;
        // DEFAULT AND 0
        case 0:
        default:
            std::cout << "Available functions are:" << std::endl;
            std::cout << "0 - Show this overview" << std::endl;
            std::cout << "1 - cuda simpleAddTest(N, M)" << std::endl;
            std::cout << "2 - cuda arrayFmaTest(N)" << std::endl;
            std::cout << "3 - cuda arrayFmaOptimizedTest(N, M)" << std::endl;
            std::cout << "4 - cuda bandwidthTest(M, N)" << std::endl;
            std::cout << "5 - cuda kernelSwitchTest(N)" << std::endl;
            std::cout << std::endl;
            std::cout << "Call: " << argv[0] << " Selection [N [M]]" << std::endl;
            std::cout << "Defaults:" << std::endl;
            std::cout << "N: 10000" << std::endl;
            std::cout << "M: 402653184" << std::endl;
            break;
    }

    return 0;
}
resources/cudaForStorm/srcCuda/allCudaKernels.h
@@ -0,0 +1,4 @@
#include "utility.h"
#include "bandWidth.h"
#include "basicAdd.h"
#include "kernelSwitchTest.h"
resources/cudaForStorm/srcCuda/basicAdd.cu
@@ -0,0 +1,286 @@
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>

#include <chrono>
#include <iostream>

__global__ void cuda_kernel_basicAdd(int a, int b, int *c) {
    *c = a + b;
}

__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
    // Fused Multiply Add:
    // A * B + C => D

    /*
     * The variable i indexes into the arrays. Since every thread executes this
     * function, i has to differ from thread to thread; otherwise several threads
     * would write to the same array index. blockDim.x is the number of threads in
     * the x-dimension of the block, blockIdx.x is the x-coordinate of the current
     * block, and threadIdx.x is the x-coordinate of the thread currently executing
     * this function.
     */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N) {
        D[i] = A[i] * B[i] + C[i];
    }
}

__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) {
    // Fused Multiply Add:
    // A * B + C => D

    // Layout:
    // A B C D A B C D A B C D

    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if ((i*M) < N) {
        for (int j = i*M; j < i*M + M; ++j) {
            A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2];
        }
    }
}

extern "C" int cuda_basicAdd(int a, int b) {
    int c = 0;
    int *dev_c;
    cudaMalloc((void**)&dev_c, sizeof(int));
    cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c);
    cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    //printf("%d + %d + 42 is %d\n", a, b, c);
    cudaFree(dev_c);
    return c;
}

void cpp_cuda_bandwidthTest(int entryCount, int N) {
    // Size of the arrays in bytes
    size_t arraySize = entryCount * sizeof(int);

    int* deviceIntArray;
    int* hostIntArray = new int[entryCount];

    // Allocate space on the device
    auto start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
            std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl;
            delete[] hostIntArray;
            return;
        }
        // Free memory on device
        if (cudaFree(deviceIntArray) != cudaSuccess) {
            std::cout << "Error in cudaFree!" << std::endl;
            delete[] hostIntArray;
            return;
        }
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    // Bytes per microsecond equal 10^6 bytes per second; the factor 0.95367431640625 = 10^6 / 2^20 converts this to MiB/s.
    double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second allocation speed." << std::endl;

    if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
        std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl;
        delete[] hostIntArray;
        return;
    }

    // Prepare data
    for (int i = 0; i < entryCount; ++i) {
        hostIntArray[i] = i * 333 + 123;
    }

    // Copy data TO device
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl;
            // Free memory on device
            if (cudaFree(deviceIntArray) != cudaSuccess) {
                std::cout << "Error in cudaFree!" << std::endl;
            }
            delete[] hostIntArray;
            return;
        }
    }
    end_time = std::chrono::high_resolution_clock::now();
    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl;

    // Copy data FROM device
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl;
            // Free memory on device
            if (cudaFree(deviceIntArray) != cudaSuccess) {
                std::cout << "Error in cudaFree!" << std::endl;
            }
            delete[] hostIntArray;
            return;
        }
    }
    end_time = std::chrono::high_resolution_clock::now();
    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl;

    // Free memory on device
    if (cudaFree(deviceIntArray) != cudaSuccess) {
        std::cout << "Error in cudaFree!" << std::endl;
    }
    delete[] hostIntArray;
}

extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
    // Size of the arrays in bytes
    size_t arraySize = N * sizeof(int);

    int* deviceIntArrayA;
    int* deviceIntArrayB;
    int* deviceIntArrayC;
    int* deviceIntArrayD;

    // Allocate space on the device
    if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc1!\n");
        return;
    }
    if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc2!\n");
        cudaFree(deviceIntArrayA);
        return;
    }
    if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc3!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        return;
    }
    if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc4!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        return;
    }

    // Copy data TO device
    if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }
    if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }
    if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }

    // Set the number of threads per block
    int threadsPerBlock = 512;
    // Use as many blocks as needed so that all elements of the vectors can be processed (ceiling division)
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Run kernel
    cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N);

    // Copy data FROM device
    if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }

    // Free memory on device
    cudaFree(deviceIntArrayA);
    cudaFree(deviceIntArrayB);
    cudaFree(deviceIntArrayC);
    cudaFree(deviceIntArrayD);
}

extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) {
    // Size of the array in bytes (N quadruples of A, B, C, D)
    size_t arraySize = N * sizeof(int) * 4;

    int* deviceIntArrayA;

    // Allocate space on the device
    if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc1!\n");
        return;
    }

    #define ONFAILFREE0() do { } while(0)
    #define ONFAILFREE1(a) do { cudaFree(a); } while(0)
    #define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0)
    #define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0)
    #define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0)
    // Runs cuda<func__>; on failure, executes freeArgs and returns. #func__
    // stringizes the argument so that the failing call is named in the message.
    #define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { if (cuda##func__ (__VA_ARGS__) != cudaSuccess) { freeArgs; printf("Error in cuda" #func__ "!\n"); return; } } while(0)

    // Copy data TO device

    CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice);

    /*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        return;
    }*/

    // Set the number of threads per block
    int threadsPerBlock = 512;
    // Use as many blocks as needed so that all elements of the vectors can be processed (ceiling division)
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Run kernel
    cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M);

    // Copy data FROM device
    if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        return;
    }

    // Free memory on device
    if (cudaFree(deviceIntArrayA) != cudaSuccess) {
        printf("Error in cudaFree!\n");
        return;
    }
}

extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
    for (int i = 0; i < N; ++i) {
        D[i] = A[i] * B[i] + C[i];
    }
}

extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) {
    for (int i = 0; i < N; i += 4) {
        A[i+3] = A[i] * A[i+1] + A[i+2];
    }
}
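
The repeated free-everything-and-return blocks in basicAdd.cu, as well as the CHECKED_CUDA_CALL macro, both work around the lack of structured cleanup in C-style CUDA code. A function-based sketch of the same check; checkCuda is a hypothetical helper, not part of this commit, built on the CUDA runtime's cudaGetErrorString:

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical helper: reports a failing CUDA runtime call by name, using
// cudaGetErrorString for a readable message, and tells the caller whether to
// proceed. Cleanup stays at the call site, as in the code above.
static bool checkCuda(cudaError_t result, char const* what) {
    if (result != cudaSuccess) {
        std::printf("Error in %s: %s\n", what, cudaGetErrorString(result));
        return false;
    }
    return true;
}

// Usage sketch, mirroring cuda_arrayFmaOptimized:
//   if (!checkCuda(cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice), "cudaMemcpy")) {
//       cudaFree(deviceIntArrayA);
//       return;
//   }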
resources/cudaForStorm/srcCuda/basicAdd.h
@@ -0,0 +1,9 @@
extern "C" int cuda_basicAdd(int a, int b);

extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M);
extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N);

extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N);
extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N);

void cpp_cuda_bandwidthTest(int entryCount, int N);
resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
@@ -0,0 +1,39 @@
#include <iostream>
#include <chrono>

__global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) {
    *B = *A;
}

void kernelSwitchTest(size_t N) {
    int* deviceIntA;
    int* deviceIntB;

    if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) {
        std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
        return;
    }
    if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) {
        std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
        cudaFree(deviceIntA);
        return;
    }

    // Launch the kernel N times and time the loop
    auto start_time = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < N; ++i) {
        cuda_kernel_kernelSwitchTest<<<1, 1>>>(deviceIntA, deviceIntB);
    }
    // Kernel launches are asynchronous; wait for all of them to finish before stopping the clock
    cudaDeviceSynchronize();
    auto end_time = std::chrono::high_resolution_clock::now();
    std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " micros" << std::endl;
    std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << " Microseconds per Kernel Switch" << std::endl;

    // Free memory on device
    if (cudaFree(deviceIntA) != cudaSuccess) {
        std::cout << "Error in cudaFree!" << std::endl;
        return;
    }
    if (cudaFree(deviceIntB) != cudaSuccess) {
        std::cout << "Error in cudaFree!" << std::endl;
        return;
    }
}
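
Host-side chrono timing around a launch loop only measures completed work because of the cudaDeviceSynchronize() before the second timestamp above; the launches themselves return immediately. An alternative sketch using CUDA events, which are recorded on the device itself; kernelSwitchTestWithEvents is a hypothetical variant, not part of this commit:

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical variant: time N back-to-back launches with CUDA events.
// cudaEventElapsedTime reports the milliseconds between the two recorded
// events as measured on the GPU, independent of the host clock.
void kernelSwitchTestWithEvents(int const* deviceIntA, int* deviceIntB, size_t N) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (size_t i = 0; i < N; ++i) {
        cuda_kernel_kernelSwitchTest<<<1, 1>>>(deviceIntA, deviceIntB);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop); // wait until the stop event has actually been reached

    float milliseconds = 0.0f;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::printf("%zu launches took %f ms on the device\n", N, milliseconds);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}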
resources/cudaForStorm/srcCuda/kernelSwitchTest.h
@@ -0,0 +1 @@
void kernelSwitchTest(size_t N);
resources/cudaForStorm/srcCuda/utility.cu
@@ -0,0 +1,19 @@
size_t getFreeCudaMemory() {
    size_t freeMemory;
    size_t totalMemory;
    cudaMemGetInfo(&freeMemory, &totalMemory);

    return freeMemory;
}

size_t getTotalCudaMemory() {
    size_t freeMemory;
    size_t totalMemory;
    cudaMemGetInfo(&freeMemory, &totalMemory);

    return totalMemory;
}

void resetCudaDevice() {
    cudaDeviceReset();
}
resources/cudaForStorm/srcCuda/utility.h
@@ -0,0 +1,3 @@
size_t getFreeCudaMemory();
size_t getTotalCudaMemory();
void resetCudaDevice();