From 8ebc0e464052ed1a2b47f59986e091404bd26b2b Mon Sep 17 00:00:00 2001
From: David_Korzeniewski
Date: Fri, 20 Feb 2015 15:54:43 +0100
Subject: [PATCH] Final touches on cuda nondeterministic linear equation solver & modelchecker

Former-commit-id: c549ae0401838adb8b5f6158f2474dc224232a75
---
 CMakeLists.txt                                 |  24 +-
 .../cudaForStorm/CMakeAlignmentCheck.cpp       |  64 --
 .../cudaForStorm/CMakeFloatAlignmentCheck.cpp  |  31 -
 resources/cudaForStorm/CMakeLists.txt          | 294 ------
 .../cudaForStorm/srcCuda/allCudaKernels.h      |   6 -
 resources/cudaForStorm/srcCuda/bandWidth.cu    |   0
 resources/cudaForStorm/srcCuda/bandWidth.h     |   0
 resources/cudaForStorm/srcCuda/basicAdd.cu     | 286 ------
 resources/cudaForStorm/srcCuda/basicAdd.h      |   9 -
 .../srcCuda/basicValueIteration.cu             | 879 ------------------
 .../srcCuda/basicValueIteration.h              | 107 ---
 resources/cudaForStorm/srcCuda/cudaForStorm.h  |  19 -
 .../cudaForStorm/srcCuda/cuspExtension.h       |  49 -
 .../srcCuda/cuspExtensionDouble.h              | 361 -------
 .../cudaForStorm/srcCuda/cuspExtensionFloat.h  | 375 --------
 .../cudaForStorm/srcCuda/kernelSwitchTest.cu   |  39 -
 .../cudaForStorm/srcCuda/kernelSwitchTest.h    |   1 -
 resources/cudaForStorm/srcCuda/utility.cu      |  33 -
 resources/cudaForStorm/srcCuda/utility.h       |  12 -
 resources/cudaForStorm/srcCuda/version.cu      |  28 -
 resources/cudaForStorm/srcCuda/version.h       |  16 -
 .../cudaForStorm/storm-cudaplugin-config.h.in  |  21 -
 ...ogicalValueIterationMdpPrctlModelChecker.h  |   4 +-
 ...onNondeterministicLinearEquationSolver.cpp  |   6 +
 ...tionNondeterministicLinearEquationSolver.h  |   2 +
 src/utility/cli.h                              |  59 +-
 ...ValueIterationMdpPrctlModelCheckerTest.cpp  |   8 +-
 27 files changed, 70 insertions(+), 2663 deletions(-)
 delete mode 100644 resources/cudaForStorm/CMakeAlignmentCheck.cpp
 delete mode 100644 resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
 delete mode 100644 resources/cudaForStorm/CMakeLists.txt
 delete mode 100644 resources/cudaForStorm/srcCuda/allCudaKernels.h
 delete mode 100644 resources/cudaForStorm/srcCuda/bandWidth.cu
 delete mode 100644 resources/cudaForStorm/srcCuda/bandWidth.h
 delete mode 100644 resources/cudaForStorm/srcCuda/basicAdd.cu
 delete mode 100644 resources/cudaForStorm/srcCuda/basicAdd.h
 delete mode 100644 resources/cudaForStorm/srcCuda/basicValueIteration.cu
 delete mode 100644 resources/cudaForStorm/srcCuda/basicValueIteration.h
 delete mode 100644 resources/cudaForStorm/srcCuda/cudaForStorm.h
 delete mode 100644 resources/cudaForStorm/srcCuda/cuspExtension.h
 delete mode 100644 resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
 delete mode 100644 resources/cudaForStorm/srcCuda/cuspExtensionFloat.h
 delete mode 100644 resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
 delete mode 100644 resources/cudaForStorm/srcCuda/kernelSwitchTest.h
 delete mode 100644 resources/cudaForStorm/srcCuda/utility.cu
 delete mode 100644 resources/cudaForStorm/srcCuda/utility.h
 delete mode 100644 resources/cudaForStorm/srcCuda/version.cu
 delete mode 100644 resources/cudaForStorm/srcCuda/version.h
 delete mode 100644 resources/cudaForStorm/storm-cudaplugin-config.h.in

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d8983670c..b79fb8d44 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,6 @@ option(STORM_USE_INTELTBB "Sets whether the Intel TBB libraries should be used."
 option(STORM_USE_COTIRE "Sets whether Cotire should be used (for building precompiled headers)." OFF)
 option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF)
 option(USE_LIBCXX "Sets whether the standard library is libc++." OFF)
-option(STORM_USE_CUDAFORSTORM "Sets whether StoRM is built with its CUDA extension." OFF)
 set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).")
 set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).")
 set(CUDA_ROOT "" CACHE STRING "The root directory of CUDA.")
@@ -227,11 +226,7 @@ endif()
 set(STORM_CPP_GLPK_DEF "define")
 
 # CUDA Defines
-if (STORM_USE_CUDAFORSTORM)
-    set(STORM_CPP_CUDAFORSTORM_DEF "define")
-else()
-    set(STORM_CPP_CUDAFORSTORM_DEF "undef")
-endif()
+set(STORM_CPP_CUDAFORSTORM_DEF "undef")
 
 # Z3 Defines
 if (ENABLE_Z3)
@@ -492,9 +487,6 @@ else(GMP_FOUND)
     endif(ENABLE_MSAT)
 endif(GMP_FOUND)
 
-if (STORM_USE_CUDAFORSTORM)
-    link_directories("${PROJECT_BINARY_DIR}/cudaForStorm/lib")
-endif()
 if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL ""))
     set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib")
 endif ()
@@ -529,20 +521,6 @@ target_link_libraries(storm ${Boost_LIBRARIES})
 #message(STATUS "BOOST_INCLUDE_DIRS is ${Boost_INCLUDE_DIRS}")
 #message(STATUS "BOOST_LIBRARY_DIRS is ${Boost_LIBRARY_DIRS}")
 
-#############################################################
-##
-##	CUDA For Storm
-##
-#############################################################
-if (STORM_USE_CUDAFORSTORM)
-    message (STATUS "StoRM - Linking with CudaForStorm")
-    include_directories("${PROJECT_BINARY_DIR}/cudaForStorm/include")
-    include_directories("${PROJECT_SOURCE_DIR}/resources/cudaForStorm")
-    target_link_libraries(storm cudaForStorm)
-    target_link_libraries(storm-functional-tests cudaForStorm)
-    target_link_libraries(storm-performance-tests cudaForStorm)
-endif(STORM_USE_CUDAFORSTORM)
-
 #############################################################
 ##
 ##	CUDD
diff --git a/resources/cudaForStorm/CMakeAlignmentCheck.cpp b/resources/cudaForStorm/CMakeAlignmentCheck.cpp
deleted file mode 100644
index 1dc9b470b..000000000
--- a/resources/cudaForStorm/CMakeAlignmentCheck.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * This is a component of the StoRM CUDA plugin that checks whether the type alignment matches the assumptions made while optimizing the code.
- */
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-#define CONTAINER_SIZE 100ul
-
-template <typename IndexType, typename ValueType>
-int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
-    std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
-    for (size_t i = 0; i < containerSize; ++i) {
-        myVector->push_back(std::make_pair(firstValue, secondValue));
-    }
-    size_t myVectorSize = myVector->size();
-    IndexType* firstStart = &(myVector->at(0).first);
-    IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
-    ValueType* secondStart = &(myVector->at(0).second);
-    ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
-    size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
-    size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
-    size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
-    size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);
-
-    delete myVector;
-    myVector = nullptr;
-
-    if (myVectorSize != containerSize) {
-        return -2;
-    }
-
-    // Check for alignment:
-    // Requirement is that the pairs are aligned like: first, second, first, second, first, second, ...
-    if (sizeof(IndexType) != sizeof(ValueType)) {
-        return -3;
-    }
-    if (startOffset != sizeof(IndexType)) {
-        return -4;
-    }
-    if (endOffset != sizeof(IndexType)) {
-        return -5;
-    }
-    if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
-        return -6;
-    }
-    if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
-        return -7;
-    }
-
-    return 0;
-}
-
-
-int main(int argc, char* argv[]) {
-    int result = 0;
-
-    result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
-    if (result != 0) {
-        return result;
-    }
-
-    return 0;
-}
\ No newline at end of file
diff --git a/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp b/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
deleted file mode 100644
index 7b3b7a8b1..000000000
--- a/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * This is a component of the StoRM CUDA plugin that checks whether a pair of uint_fast64_t and float gets auto-aligned to match 64-bit boundaries.
- */
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-#define CONTAINER_SIZE 100ul
-
-int main(int argc, char* argv[]) {
-    int result = 0;
-
-    std::vector<std::pair<uint_fast64_t, float>> myVector;
-    for (size_t i = 0; i < CONTAINER_SIZE; ++i) {
-        myVector.push_back(std::make_pair(i, 42.12345f * i));
-    }
-
-    char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first));
-    char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first));
-    ptrdiff_t uintDiff = secondUintPointer - firstUintPointer;
-
-    if (uintDiff == (2 * sizeof(uint_fast64_t))) {
-        result = 2;
-    } else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) {
-        result = 3;
-    } else {
-        result = -5;
-    }
-
-    return result;
-}
\ No newline at end of file
diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt
deleted file mode 100644
index d7d525386..000000000
--- a/resources/cudaForStorm/CMakeLists.txt
+++ /dev/null
@@ -1,294 +0,0 @@
-cmake_minimum_required (VERSION 2.8.6)
-
-# Set project name
-project (cudaForStorm CXX C)
-
-# Set the version number
-set (STORM_CPP_VERSION_MAJOR 1)
-set (STORM_CPP_VERSION_MINOR 0)
-
-# Add base folder for better inclusion paths
-include_directories("${PROJECT_SOURCE_DIR}")
-include_directories("${PROJECT_SOURCE_DIR}/src")
-
-message(STATUS "StoRM (CudaPlugin) - CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}")
-
-#############################################################
-##
-##	CMake options of StoRM
-##
-#############################################################
-option(CUDAFORSTORM_DEBUG "Sets whether the DEBUG mode is used" ON)
-option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF)
-option(USE_LIBCXX "Sets whether the standard library is libc++." OFF)
-
-set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.")
-set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.")
-set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE STRING "The Build directory of storm, where the library files should be installed to (if available).")
-
-#############################################################
-##
-##	Inclusion of required libraries
-##
-#############################################################
-
-# Add the resources/cmake folder to Module Search Path for FindTBB.cmake
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/")
-
-# Set the hint for CUSP
-set(CUSP_HINT "${PROJECT_SOURCE_DIR}/../3rdparty/cusplibrary")
-
-find_package(CUDA REQUIRED)
-find_package(Cusp REQUIRED)
-find_package(Doxygen REQUIRED)
-find_package(Thrust REQUIRED)
-
-# If the DEBUG option was turned on, we will target a debug version and a release version otherwise
-if (CUDAFORSTORM_DEBUG)
-    set (CMAKE_BUILD_TYPE "DEBUG")
-else()
-    set (CMAKE_BUILD_TYPE "RELEASE")
-endif()
-message(STATUS "StoRM (CudaPlugin) - Building ${CMAKE_BUILD_TYPE} version.")
-
-message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
-message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}")
-
-#############################################################
-##
-##	CUDA Options
-##
-#############################################################
-SET (CUDA_VERBOSE_BUILD ON CACHE BOOL "nvcc verbose" FORCE)
-set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
-set(BUILD_SHARED_LIBS OFF)
-set(CUDA_SEPARABLE_COMPILATION ON)
-#set(CUDA_NVCC_FLAGS "-arch=sm_30")
-
-# Because the FindCUDA.cmake file has a path related bug, two folders have to be present
-file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Debug")
-file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Release")
-
-
-#############################################################
-##
-##	Compiler specific settings and definitions
-##
-#############################################################
-if(CMAKE_COMPILER_IS_GNUCC)
-    message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: GCC")
-    # Set standard flags for GCC
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -pedantic")
-elseif(MSVC)
-    message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: MSVC")
-    # required for GMM to compile, ugly error directive in their code
-    add_definitions(/D_SCL_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS)
-    # required as the PRCTL Parser bloats object files (COFF) beyond their maximum size (see http://msdn.microsoft.com/en-us/library/8578y171(v=vs.110).aspx)
-    add_definitions(/bigobj)
-    # required by GTest and PrismGrammar::createIntegerVariable
-    add_definitions(/D_VARIADIC_MAX=10)
-    # Windows.h breaks GMM in gmm_except.h because of its macro definition for min and max
-    add_definitions(/DNOMINMAX)
-else(CLANG)
-    message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: Clang (LLVM)")
-    # As CLANG is not set as a variable, we need to set it in case we have not matched another compiler.
-    set (CLANG ON)
-    # Set standard flags for clang
-    set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops -O3")
-    if(UNIX AND NOT APPLE AND NOT USE_LIBCXX)
-        set(CLANG_STDLIB libstdc++)
-        message(STATUS "StoRM (CudaPlugin) - Linking against libstdc++")
-    else()
-        set(CLANG_STDLIB libc++)
-        message(STATUS "StoRM (CudaPlugin) - Linking against libc++")
-        # Set up some Xcode specific settings
-        set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11")
-        set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++")
-    endif()
-
-    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=${CLANG_STDLIB} -Wall -pedantic -Wno-unused-variable -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -ftemplate-depth=1024")
-
-    set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
-endif()
-
-#############################################################
-##
-##	CMake-generated Config File for StoRM
-##
-#############################################################
-
-# Test for type alignment
-try_run(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT
-    ${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp"
-    COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR
-)
-if(NOT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT)
-    message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}")
-elseif(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT EQUAL 0)
-    message(STATUS "StoRM (CudaPlugin) - Result of Type Alignment Check: OK.")
-else()
-    message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})")
-endif()
-
-# Test for Float 64bit Alignment
-try_run(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT
-    ${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp"
-    COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR
-)
-if(NOT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT)
-    message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test float type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}")
-elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 2)
-    message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment active.")
-    set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "define")
-elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 3)
-    message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment disabled.")
-    set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "undef")
-else()
-    message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Float Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_FLOATALIGNMENT})")
-endif()
-
-
-#
-# Make a version file containing the current version from git.
-#
-include(GetGitRevisionDescription)
-git_describe_checkout(STORM_GIT_VERSION_STRING)
-# Parse the git Tag into variables
-string(REGEX REPLACE "^([0-9]+)\\..*" "\\1" STORM_CUDAPLUGIN_VERSION_MAJOR "${STORM_GIT_VERSION_STRING}")
-string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_MINOR "${STORM_GIT_VERSION_STRING}")
-string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_PATCH "${STORM_GIT_VERSION_STRING}")
-string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-([0-9]+)\\-.*" "\\1" STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD "${STORM_GIT_VERSION_STRING}")
-string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-([a-z0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_HASH "${STORM_GIT_VERSION_STRING}")
-string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-[a-z0-9]+\\-(.*)" "\\1" STORM_CUDAPLUGIN_VERSION_APPENDIX "${STORM_GIT_VERSION_STRING}")
-if ("${STORM_CUDAPLUGIN_VERSION_APPENDIX}" MATCHES "^.*dirty.*$")
-    set(STORM_CUDAPLUGIN_VERSION_DIRTY 1)
-else()
-    set(STORM_CUDAPLUGIN_VERSION_DIRTY 0)
-endif()
-message(STATUS "StoRM (CudaPlugin) - Version information: ${STORM_CUDAPLUGIN_VERSION_MAJOR}.${STORM_CUDAPLUGIN_VERSION_MINOR}.${STORM_CUDAPLUGIN_VERSION_PATCH} (${STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD} commits ahead of Tag) build from ${STORM_CUDAPLUGIN_VERSION_HASH} (Dirty: ${STORM_CUDAPLUGIN_VERSION_DIRTY})")
-
-
-# Configure a header file to pass some of the CMake settings to the source code
-configure_file (
-    "${PROJECT_SOURCE_DIR}/storm-cudaplugin-config.h.in"
-    "${PROJECT_BINARY_DIR}/include/storm-cudaplugin-config.h"
-)
-# Add the binary dir include directory for storm-config.h
-include_directories("${PROJECT_BINARY_DIR}/include")
-
-# Add the main source directory for includes
-include_directories("${PROJECT_SOURCE_DIR}/../../src")
-
-#############################################################
-##
-##	Source file aggregation and clustering
-##
-#############################################################
-file(GLOB_RECURSE CUDAFORSTORM_HEADERS ${PROJECT_SOURCE_DIR}/src/*.h)
-file(GLOB_RECURSE CUDAFORSTORM_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp)
-
-file(GLOB_RECURSE CUDAFORSTORM_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/srcCuda/*.cu")
-file(GLOB_RECURSE CUDAFORSTORM_CUDA_HEADERS "${PROJECT_SOURCE_DIR}/srcCuda/*.h")
-
-# Additional include files like the storm-config.h
-file(GLOB_RECURSE CUDAFORSTORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h)
-
-# Group the headers and sources
-source_group(main FILES ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES})
-source_group(cuda FILES ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS})
-
-# Add custom additional include or link directories
-if (ADDITIONAL_INCLUDE_DIRS)
-    message(STATUS "StoRM (CudaPlugin) - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}")
-    include_directories(${ADDITIONAL_INCLUDE_DIRS})
-endif(ADDITIONAL_INCLUDE_DIRS)
-if (ADDITIONAL_LINK_DIRS)
-    message(STATUS "StoRM (CudaPlugin) - Using additional link directories ${ADDITIONAL_LINK_DIRS}")
-    link_directories(${ADDITIONAL_LINK_DIRS})
-endif(ADDITIONAL_LINK_DIRS)
-
-#############################################################
-##
-##	Pre executable-creation link_directories setup
-##
-#############################################################
-
-
-
-#############################################################
-##
-##	CUDA
-##
-#############################################################
-#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30)
-#target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY})
-#ADD_DEPENDENCIES(cudaForStorm cudaLibrary)
-#target_link_libraries(cudaForStorm cudaLibrary)
-message(STATUS "StoRM (CudaPlugin) - Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}")
-include_directories(${CUDA_INCLUDE_DIRS})
-
-#############################################################
-##
-##	CUSP
-##
-#############################################################
-if(CUSP_FOUND)
-    include_directories(${CUSP_INCLUDE_DIR})
-    cuda_include_directories(${CUSP_INCLUDE_DIR})
-    message(STATUS "StoRM (CudaPlugin) - Found CUSP Version ${CUSP_VERSION} in location ${CUSP_INCLUDE_DIR}")
-else()
-    message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find CUSP!")
-endif()
-
-#############################################################
-##
-##	Thrust
-##
-#############################################################
-if(THRUST_FOUND)
-    include_directories(${THRUST_INCLUDE_DIR})
-    cuda_include_directories(${THRUST_INCLUDE_DIR})
-    message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION} in location ${THRUST_INCLUDE_DIR}")
-else()
-    message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find Thrust! Check your CUDA installation.")
-endif()
-
-###############################################################################
-##                                                                           #
-##	Executable Creation                                                  #
-##                                                                           #
-##	All link_directories() calls AND include_directories() calls         #
-##	MUST be made before this point                                       #
-##                                                                           #
-###############################################################################
-include (GenerateExportHeader)
-
-cuda_add_library(cudaForStorm SHARED
-    ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}
-    OPTIONS -DSTUFF="" -arch=sm_30
-    RELEASE -DNDEBUG
-    DEBUG -g -DDEBUG
-)
-GENERATE_EXPORT_HEADER( cudaForStorm
-    BASE_NAME cudaForStorm
-    EXPORT_MACRO_NAME cudaForStorm_EXPORT
-    EXPORT_FILE_NAME include/cudaForStorm_Export.h
-    STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
-)
-
-if (MSVC)
-    # Add the DebugHelper DLL
-    set(CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES} Dbghelp.lib")
-    target_link_libraries(cudaForStorm "Dbghelp.lib")
-endif(MSVC)
-
-# Link against libc++abi if requested. May be needed to build on Linux systems using clang.
-if (LINK_LIBCXXABI)
-    message (STATUS "StoRM (CudaPlugin) - Linking against libc++abi.")
-    target_link_libraries(cudaForStorm "c++abi")
-endif(LINK_LIBCXXABI)
-
-# Install Directive
-install(TARGETS cudaForStorm DESTINATION "${STORM_LIB_INSTALL_DIR}/lib")
-install(FILES "${PROJECT_SOURCE_DIR}/srcCuda/cudaForStorm.h" "${PROJECT_BINARY_DIR}/include/cudaForStorm_Export.h" DESTINATION "${STORM_LIB_INSTALL_DIR}/include")
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/allCudaKernels.h b/resources/cudaForStorm/srcCuda/allCudaKernels.h
deleted file mode 100644
index 50bf92191..000000000
--- a/resources/cudaForStorm/srcCuda/allCudaKernels.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "utility.h"
-#include "bandWidth.h"
-#include "basicAdd.h"
-#include "kernelSwitchTest.h"
-#include "basicValueIteration.h"
-#include "version.h"
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/bandWidth.cu b/resources/cudaForStorm/srcCuda/bandWidth.cu
deleted file mode 100644
index e69de29bb..000000000
diff --git a/resources/cudaForStorm/srcCuda/bandWidth.h b/resources/cudaForStorm/srcCuda/bandWidth.h
deleted file mode 100644
index e69de29bb..000000000
diff --git a/resources/cudaForStorm/srcCuda/basicAdd.cu b/resources/cudaForStorm/srcCuda/basicAdd.cu
deleted file mode 100644
index 88b44e3bf..000000000
--- a/resources/cudaForStorm/srcCuda/basicAdd.cu
+++ /dev/null
@@ -1,286 +0,0 @@
-#include
-#include
-#include
-
-#include
-#include
-
-__global__ void cuda_kernel_basicAdd(int a, int b, int *c) {
-    *c = a + b;
-}
-
-__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
-    // Fused Multiply Add:
-    // A * B + C => D
-
-    /*
-     * The variable i is used to index into the arrays. Since every thread executes the function VecAdd,
-     * i has to be different for each thread; otherwise, different threads would write to the same index
-     * of the array. blockDim.x is the number of threads in the x-component of the block, blockIdx.x is
-     * the x-coordinate of the current block, and threadIdx.x is the x-coordinate of the thread that is
-     * currently executing the function.
-     */
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < N) {
-        D[i] = A[i] * B[i] + C[i];
-    }
-}
-
-__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) {
-    // Fused Multiply Add:
-    // A * B + C => D
-
-    // Layout:
-    // A B C D A B C D A B C D
-
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if ((i*M) < N) {
-        for (int j = i*M; j < i*M + M; ++j) {
-            A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2];
-        }
-    }
-}
-
-extern "C" int cuda_basicAdd(int a, int b) {
-    int c = 0;
-    int *dev_c;
-    cudaMalloc((void**)&dev_c, sizeof(int));
-    cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c);
-    cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
-    //printf("%d + %d + 42 is %d\n", a, b, c);
-    cudaFree(dev_c);
-    return c;
-}
-
-void cpp_cuda_bandwidthTest(int entryCount, int N) {
-    // Size of the Arrays
-    size_t arraySize = entryCount * sizeof(int);
-
-    int* deviceIntArray;
-    int* hostIntArray = new int[arraySize];
-
-    // Allocate space on the device
-    auto start_time = std::chrono::high_resolution_clock::now();
-    for (int i = 0; i < N; ++i) {
-        if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
-            std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl;
-            delete[] hostIntArray;
-            return;
-        }
-        // Free memory on device
-        if (cudaFree(deviceIntArray) != cudaSuccess) {
-            std::cout << "Error in cudaFree!" << std::endl;
-            delete[] hostIntArray;
-            return;
-        }
-    }
-    auto end_time = std::chrono::high_resolution_clock::now();
-    auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
-    double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
-    std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
-    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second Allocationspeed." << std::endl;
-
-    if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
-        std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl;
-        delete[] hostIntArray;
-        return;
-    }
-
-    // Prepare data
-    for (int i = 0; i < N; ++i) {
-        hostIntArray[i] = i * 333 + 123;
-    }
-
-    // Copy data TO device
-    start_time = std::chrono::high_resolution_clock::now();
-    for (int i = 0; i < N; ++i) {
-        if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
-            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl;
-            // Free memory on device
-            if (cudaFree(deviceIntArray) != cudaSuccess) {
-                std::cout << "Error in cudaFree!" << std::endl;
-            }
-            delete[] hostIntArray;
-            return;
-        }
-    }
-    end_time = std::chrono::high_resolution_clock::now();
-    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
-    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
-    std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
-    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl;
-
-    // Copy data FROM device
-    start_time = std::chrono::high_resolution_clock::now();
-    for (int i = 0; i < N; ++i) {
-        if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
-            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl;
-            // Free memory on device
-            if (cudaFree(deviceIntArray) != cudaSuccess) {
-                std::cout << "Error in cudaFree!" << std::endl;
-            }
-            delete[] hostIntArray;
-            return;
-        }
-    }
-    end_time = std::chrono::high_resolution_clock::now();
-    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
-    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
-    std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
-    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl;
-
-    // Free memory on device
-    if (cudaFree(deviceIntArray) != cudaSuccess) {
-        std::cout << "Error in cudaFree!" << std::endl;
-    }
-    delete[] hostIntArray;
-}
-
-extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
-    // Size of the Arrays
-    size_t arraySize = N * sizeof(int);
-
-    int* deviceIntArrayA;
-    int* deviceIntArrayB;
-    int* deviceIntArrayC;
-    int* deviceIntArrayD;
-
-    // Allocate space on the device
-    if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
-        printf("Error in cudaMalloc1!\n");
-        return;
-    }
-    if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) {
-        printf("Error in cudaMalloc2!\n");
-        cudaFree(deviceIntArrayA);
-        return;
-    }
-    if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) {
-        printf("Error in cudaMalloc3!\n");
-        cudaFree(deviceIntArrayA);
-        cudaFree(deviceIntArrayB);
-        return;
-    }
-    if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) {
-        printf("Error in cudaMalloc4!\n");
-        cudaFree(deviceIntArrayA);
-        cudaFree(deviceIntArrayB);
-        cudaFree(deviceIntArrayC);
-        return;
-    }
-
-    // Copy data TO device
-    if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
-        printf("Error in cudaMemcpy!\n");
-        cudaFree(deviceIntArrayA);
-        cudaFree(deviceIntArrayB);
-        cudaFree(deviceIntArrayC);
-        cudaFree(deviceIntArrayD);
-        return;
-    }
-    if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
-        printf("Error in cudaMemcpy!\n");
-        cudaFree(deviceIntArrayA);
-        cudaFree(deviceIntArrayB);
-        cudaFree(deviceIntArrayC);
-        cudaFree(deviceIntArrayD);
-        return;
-    }
-    if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
-        printf("Error in cudaMemcpy!\n");
-        cudaFree(deviceIntArrayA);
-        cudaFree(deviceIntArrayB);
-        cudaFree(deviceIntArrayC);
-        cudaFree(deviceIntArrayD);
-        return;
-    }
-
-    // Determine the number of threads per block
-    int threadsPerBlock = 512;
-    // Use as many blocks as are needed to process all elements of the vectors
-    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
-
-    // Run kernel
-    cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N);
-
-    // Copy data FROM device
-    if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
-        printf("Error in cudaMemcpy!\n");
-        cudaFree(deviceIntArrayA);
-        cudaFree(deviceIntArrayB);
-        cudaFree(deviceIntArrayC);
-        cudaFree(deviceIntArrayD);
-        return;
-    }
-
-    // Free memory on device
-    cudaFree(deviceIntArrayA);
-    cudaFree(deviceIntArrayB);
-    cudaFree(deviceIntArrayC);
-    cudaFree(deviceIntArrayD);
-}
-
-extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) {
-    // Size of the Arrays
-    size_t arraySize = N * sizeof(int) * 4;
-
-    int* deviceIntArrayA;
-
-    // Allocate space on the device
-    if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
-        printf("Error in cudaMalloc1!\n");
-        return;
-    }
-
-#define ONFAILFREE0() do { } while(0)
-#define ONFAILFREE1(a) do { cudaFree(a); } while(0)
-#define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0)
-#define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0)
-#define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0)
-#define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { int retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in func__!\n"); return; } } while(0)
-
-    // Copy data TO device
-
-    CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice);
-
-    /*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
-        printf("Error in cudaMemcpy!\n");
-        cudaFree(deviceIntArrayA);
-        return;
-    }*/
-
-    // Determine the number of threads per block
-    int threadsPerBlock = 512;
-    // Use as many blocks as are needed to process all elements of the vectors
-    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
-
-    // Run kernel
-    cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M);
-
-    // Copy data FROM device
-    if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
-        printf("Error in cudaMemcpy!\n");
-        cudaFree(deviceIntArrayA);
-        return;
-    }
-
-    // Free memory on device
-    if (cudaFree(deviceIntArrayA) != cudaSuccess) {
-        printf("Error in cudaFree!\n");
-        return;
-    }
-}
-
-extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
-    for (int i = 0; i < N; ++i) {
-        D[i] = A[i] * B[i] + C[i];
-    }
-}
-
-extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) {
-    for (int i = 0; i < N; i += 4) {
-        A[i+3] = A[i] * A[i+1] + A[i+2];
-    }
-}
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/basicAdd.h b/resources/cudaForStorm/srcCuda/basicAdd.h
deleted file mode 100644
index b167244e8..000000000
--- a/resources/cudaForStorm/srcCuda/basicAdd.h
+++ /dev/null
@@ -1,9 +0,0 @@
-extern "C" int cuda_basicAdd(int a, int b);
-
-extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M);
-extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N);
-
-extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N);
-extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N);
-
-void cpp_cuda_bandwidthTest(int entryCount, int N);
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
deleted file mode 100644
index 6aa4a2fb4..000000000
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu
+++ /dev/null
@@ -1,879 +0,0 @@
-#include "basicValueIteration.h"
-#define CUSP_USE_TEXTURE_MEMORY
-
-#include
-#include
-
-#include
-#include "cusparse_v2.h"
-
-#include "utility.h"
-
-#include "cuspExtension.h"
-
-#include
-#include
-#include
-
-#include "storm-cudaplugin-config.h"
-
-#ifdef DEBUG
-#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false)
-#else
-#define CUDA_CHECK_ALL_ERRORS() do {} while (false)
-#endif
-
-template <typename T, bool Relative>
-struct equalModuloPrecision : public thrust::binary_function<T, T, T>
-{
-__host__ __device__ T operator()(const T &x, const T &y) const
-{
-    if (Relative) {
-        if (y == 0) {
-            return ((x >= 0) ? (x) : (-x));
-        }
-        const T result = (x - y) / y;
-        return ((result >= 0) ? (result) : (-result));
-    } else {
-        const T result = (x - y);
-        return ((result >= 0) ? (result) : (-result));
-    }
-}
-};
-
-template <typename IndexType, typename ValueType>
-void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) {
-    indexVector.reserve(inputVector.size());
-    valueVector.reserve(inputVector.size());
-    for (size_t i = 0; i < inputVector.size(); ++i) {
-        indexVector.push_back(inputVector.at(i).first);
-        valueVector.push_back(inputVector.at(i).second);
-    }
-}
-
-// TEMPLATE VERSION
-template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
-bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) {
-    //std::vector<IndexType> matrixColumnIndices;
-    //std::vector<ValueType> matrixValues;
-    //exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues);
-    bool errorOccured = false;
-
-    IndexType* device_matrixRowIndices = nullptr;
-    ValueType* device_matrixColIndicesAndValues = nullptr;
-    ValueType* device_x = nullptr;
-    ValueType* device_xSwap = nullptr;
-    ValueType* device_b = nullptr;
-    ValueType* device_multiplyResult = nullptr;
-    IndexType* device_nondeterministicChoiceIndices = nullptr;
-
-#ifdef DEBUG
-    std::cout.sync_with_stdio(true);
-    std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
-    std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
-    size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
-    std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
-#endif
-
-    const IndexType matrixRowCount = matrixRowIndices.size() - 1;
-    const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1;
-    const IndexType matrixNnzCount = columnIndicesAndValues.size();
-
-    cudaError_t cudaMallocResult;
-
-    bool converged = false;
-    iterationCount = 0;
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
-#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true
-#else
-#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false
-#endif
-    if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
-        CUDA_CHECK_ALL_ERRORS();
-        cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
-        if (cudaMallocResult != cudaSuccess) {
-            std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
-            errorOccured = true;
-            goto cleanup;
-        }
-    } else {
-        CUDA_CHECK_ALL_ERRORS();
-        cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
-        if (cudaMallocResult != cudaSuccess) {
-            std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
-            errorOccured = true;
-            goto cleanup;
-        }
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_xSwap), sizeof(ValueType) * matrixColCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * matrixRowCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1));
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-#ifdef DEBUG
-    std::cout << "(DLL) Finished allocating memory." << std::endl;
-#endif
-
-    // Memory allocated, copy data to device
-    cudaError_t cudaCopyResult;
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-    // Copy all data as floats are expanded to 64bits :/
-    if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
-        CUDA_CHECK_ALL_ERRORS();
-        cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
-        if (cudaCopyResult != cudaSuccess) {
-            std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
-            errorOccured = true;
-            goto cleanup;
-        }
-    } else {
-        CUDA_CHECK_ALL_ERRORS();
-        cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
-        if (cudaCopyResult != cudaSuccess) {
-            std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
-            errorOccured = true;
-            goto cleanup;
-        }
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-    // Preset the xSwap to zeros...
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-    // Preset the multiplyResult to zeros...
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-#ifdef DEBUG
-    std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
-#endif
-
-    // Data is on device, start Kernel
-    while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization
-        cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
-        CUDA_CHECK_ALL_ERRORS();
-
-        thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
-        thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult);
-
-        // Transform: Add multiplyResult + b inplace to multiplyResult
-        thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>());
-        CUDA_CHECK_ALL_ERRORS();
-
-        // Reduce: Reduce multiplyResult to a new x vector
-        cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
-        CUDA_CHECK_ALL_ERRORS();
-
-        // Check for convergence
-        // Transform: x = abs(x - xSwap)/ xSwap
-        thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
-        thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount);
-        thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap);
-        thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
-        CUDA_CHECK_ALL_ERRORS();
-
-        // Reduce: get Max over x and check for res < Precision
-        ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
-        CUDA_CHECK_ALL_ERRORS();
-        converged = (maxX < precision);
-        ++iterationCount;
-
-        // Swap pointers, device_x always contains the most current result
-        std::swap(device_x, device_xSwap);
-    }
-
-    if (!converged && (iterationCount == maxIterationCount)) {
-        iterationCount = 0;
-        errorOccured = true;
-    }
-
-#ifdef DEBUG
-    std::cout << "(DLL) Finished kernel execution." << std::endl;
-    std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
-#endif
-
-    // Get x back from the device
-    cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl;
-        errorOccured = true;
-        goto cleanup;
-    }
-
-#ifdef DEBUG
-    std::cout << "(DLL) Finished copying result data." << std::endl;
-#endif
-
-    // All code related to freeing memory and clearing up the device
-cleanup:
-    if (device_matrixRowIndices != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl;
-            errorOccured = true;
-        }
-        device_matrixRowIndices = nullptr;
-    }
-    if (device_matrixColIndicesAndValues != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
-            errorOccured = true;
-        }
-        device_matrixColIndicesAndValues = nullptr;
-    }
-    if (device_x != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_x);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
-            errorOccured = true;
-        }
-        device_x = nullptr;
-    }
-    if (device_xSwap != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_xSwap);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl;
-            errorOccured = true;
-        }
-        device_xSwap = nullptr;
-    }
-    if (device_b != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_b);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl;
-            errorOccured = true;
-        }
-        device_b = nullptr;
-    }
-    if (device_multiplyResult != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_multiplyResult);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl;
-            errorOccured = true;
-        }
-        device_multiplyResult = nullptr;
-    }
-    if (device_nondeterministicChoiceIndices != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." << std::endl;
-            errorOccured = true;
-        }
-        device_nondeterministicChoiceIndices = nullptr;
-    }
-#ifdef DEBUG
-    std::cout << "(DLL) Finished cleanup." << std::endl;
-#endif
-
-    return !errorOccured;
-}
-
-template <typename IndexType, typename ValueType>
-void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) {
-    IndexType* device_matrixRowIndices = nullptr;
-    ValueType* device_matrixColIndicesAndValues = nullptr;
-    ValueType* device_x = nullptr;
-    ValueType* device_multiplyResult = nullptr;
-
-#ifdef DEBUG
-    std::cout.sync_with_stdio(true);
-    std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl;
-    std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
-    size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size();
-    std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
-#endif
-
-    const IndexType matrixRowCount = matrixRowIndices.size() - 1;
-    const IndexType matrixNnzCount = columnIndicesAndValues.size();
-
-    cudaError_t cudaMallocResult;
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
-        goto cleanup;
-    }
-
-#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
-        goto cleanup;
-    }
-#else
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
-        goto cleanup;
-    }
-#endif
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
-        goto cleanup;
-    }
-
-#ifdef DEBUG
-    std::cout << "(DLL) Finished allocating memory." << std::endl;
-#endif
-
-    // Memory allocated, copy data to device
-    cudaError_t cudaCopyResult;
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
-        goto cleanup;
-    }
-
-#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
-        goto cleanup;
-    }
-#else
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
-        goto cleanup;
-    }
-#endif
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
-        goto cleanup;
-    }
-
-    // Preset the multiplyResult to zeros...
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
-        goto cleanup;
-    }
-
-#ifdef DEBUG
-    std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
-#endif
-
-    cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
-    CUDA_CHECK_ALL_ERRORS();
-
-#ifdef DEBUG
-    std::cout << "(DLL) Finished kernel execution." << std::endl;
-#endif
-
-    // Get result back from the device
-    cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
-        goto cleanup;
-    }
-
-#ifdef DEBUG
-    std::cout << "(DLL) Finished copying result data." << std::endl;
-#endif
-
-    // All code related to freeing memory and clearing up the device
-cleanup:
-    if (device_matrixRowIndices != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl;
-        }
-        device_matrixRowIndices = nullptr;
-    }
-    if (device_matrixColIndicesAndValues != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
-        }
-        device_matrixColIndicesAndValues = nullptr;
-    }
-    if (device_x != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_x);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
-        }
-        device_x = nullptr;
-    }
-    if (device_multiplyResult != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_multiplyResult);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl;
-        }
-        device_multiplyResult = nullptr;
-    }
-#ifdef DEBUG
-    std::cout << "(DLL) Finished cleanup." << std::endl;
-#endif
-}
-
-template <typename ValueType>
-void basicValueIteration_addVectorsInplace(std::vector<ValueType>& a, std::vector<ValueType> const& b) {
-    ValueType* device_a = nullptr;
-    ValueType* device_b = nullptr;
-
-    const size_t vectorSize = std::max(a.size(), b.size());
-
-    cudaError_t cudaMallocResult;
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_a), sizeof(ValueType) * vectorSize);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * vectorSize);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
<< std::endl; - goto cleanup; - } - - // Memory allocated, copy data to device - cudaError_t cudaCopyResult; - - CUDA_CHECK_ALL_ERRORS(); - cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); - if (cudaCopyResult != cudaSuccess) { - std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl; - goto cleanup; - } - - CUDA_CHECK_ALL_ERRORS(); - cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); - if (cudaCopyResult != cudaSuccess) { - std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; - goto cleanup; - } - - do { - // Transform: Add multiplyResult + b inplace to multiplyResult - thrust::device_ptr devicePtrThrust_a(device_a); - thrust::device_ptr devicePtrThrust_b(device_b); - thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus()); - CUDA_CHECK_ALL_ERRORS(); - } while (false); - - // Get result back from the device - cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost); - if (cudaCopyResult != cudaSuccess) { - std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; - goto cleanup; - } - - // All code related to freeing memory and clearing up the device -cleanup: - if (device_a != nullptr) { - cudaError_t cudaFreeResult = cudaFree(device_a); - if (cudaFreeResult != cudaSuccess) { - std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl; - } - device_a = nullptr; - } - if (device_b != nullptr) { - cudaError_t cudaFreeResult = cudaFree(device_b); - if (cudaFreeResult != cudaSuccess) { - std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl; - } - device_b = nullptr; - } -} - -template -void basicValueIteration_reduceGroupedVector(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { - ValueType* device_groupedVector = nullptr; - IndexType* device_grouping = nullptr; - ValueType* device_target = nullptr; - - const size_t groupedSize = groupedVector.size(); - const size_t groupingSize = grouping.size(); - const size_t targetSize = targetVector.size(); - - cudaError_t cudaMallocResult; - - CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_groupedVector), sizeof(ValueType) * groupedSize); - if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl; - goto cleanup; - } - - CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_grouping), sizeof(IndexType) * groupingSize); - if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl; - goto cleanup; - } - - CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_target), sizeof(ValueType) * targetSize); - if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." 
<< std::endl;
-		goto cleanup;
-	}
-
-	// Memory allocated, copy data to device
-	cudaError_t cudaCopyResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	do {
-		// Reduce: Reduce multiplyResult to a new x vector
-		cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector);
-		CUDA_CHECK_ALL_ERRORS();
-	} while (false);
-
-	// Get result back from the device
-	cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	// All code related to freeing memory and clearing up the device
-cleanup:
-	if (device_groupedVector != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_groupedVector);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_groupedVector = nullptr;
-	}
-	if (device_grouping != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_grouping);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_grouping = nullptr;
-	}
-	if (device_target != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_target);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_target = nullptr;
-	}
-}
-
-template <typename ValueType, bool Relative>
-void basicValueIteration_equalModuloPrecision(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType& maxElement) {
-	ValueType* device_x = nullptr;
-	ValueType* device_y = nullptr;
-
-	const size_t vectorSize = x.size();
-
-	cudaError_t cudaMallocResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * vectorSize);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_y), sizeof(ValueType) * vectorSize);
-	if (cudaMallocResult != cudaSuccess) {
-		std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "."
<< std::endl;
-		goto cleanup;
-	}
-
-	// Memory allocated, copy data to device
-	cudaError_t cudaCopyResult;
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	CUDA_CHECK_ALL_ERRORS();
-	cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
-	if (cudaCopyResult != cudaSuccess) {
-		std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl;
-		goto cleanup;
-	}
-
-	do {
-		// Transform: x = abs(x - xSwap)/ xSwap
-		thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
-		thrust::device_ptr<ValueType> devicePtrThrust_y(device_y);
-		thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
-		CUDA_CHECK_ALL_ERRORS();
-
-		// Reduce: get Max over x and check for res < Precision
-		maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
-		CUDA_CHECK_ALL_ERRORS();
-	} while (false);
-
-	// All code related to freeing memory and clearing up the device
-cleanup:
-	if (device_x != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_x);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_x = nullptr;
-	}
-	if (device_y != nullptr) {
-		cudaError_t cudaFreeResult = cudaFree(device_y);
-		if (cudaFreeResult != cudaSuccess) {
-			std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." << std::endl;
-		}
-		device_y = nullptr;
-	}
-}
-
-/*
- * Declare and implement all exported functions for these Kernels here
- *
- */
-
-void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) {
-	basicValueIteration_spmv<uint_fast64_t, double>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
-}
-
-void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) {
-	basicValueIteration_addVectorsInplace<double>(a, b);
-}
-
-void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
-	basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(groupedVector, grouping, targetVector);
-}
-
-void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
-	basicValueIteration_reduceGroupedVector<uint_fast64_t, double, false>(groupedVector, grouping, targetVector);
-}
-
-void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
-	basicValueIteration_equalModuloPrecision<double, true>(x, y, maxElement);
-}
-
-void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
-	basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement);
-}
-
-// Float
-void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b) {
-	basicValueIteration_spmv<uint_fast64_t, float>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
-}
-
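
For reference, the exported mvReduce entry points above are plain C++ functions, so any host program linking against the cudaForStorm library could drive a complete value iteration. The following minimal sketch is illustrative only and not part of the removed sources: the two-group, three-row MDP data is hypothetical, the include path assumes the plugin's source layout, and a CUDA-capable device is required at run time.

#include <cstdint>
#include <iostream>
#include <vector>

#include "srcCuda/basicValueIteration.h"

int main() {
	// CSR row offsets for three matrix rows (one non-zero entry per row).
	std::vector<uint_fast64_t> rowIndices = { 0, 1, 2, 3 };
	// One (column, value) entry per row.
	std::vector<storm::storage::MatrixEntry<double>> entries = { { 1, 1.0 }, { 0, 1.0 }, { 1, 1.0 } };
	// Row group offsets: group 0 owns row 0, group 1 owns rows 1 and 2.
	std::vector<uint_fast64_t> groups = { 0, 1, 3 };
	std::vector<double> x(2, 0.0);             // one value per row group
	std::vector<double> b = { 0.0, 0.0, 0.5 }; // one additive offset per row
	size_t iterations = 0;

	// Minimizing value iteration with an absolute precision check,
	// capped at 10000 iterations.
	bool converged = basicValueIteration_mvReduce_uint64_double_minimize(
		10000, 1e-6, false, rowIndices, entries, x, b, groups, iterations);
	std::cout << (converged ? "converged" : "aborted") << " after "
	          << iterations << " iterations" << std::endl;
	return converged ? 0 : 1;
}

The same call shape applies to the float and maximize variants below; only the name suffix and the element types change.
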
-void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b) {
-	basicValueIteration_addVectorsInplace<float>(a, b);
-}
-
-void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
-	basicValueIteration_reduceGroupedVector<uint_fast64_t, float, true>(groupedVector, grouping, targetVector);
-}
-
-void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
-	basicValueIteration_reduceGroupedVector<uint_fast64_t, float, false>(groupedVector, grouping, targetVector);
-}
-
-void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
-	basicValueIteration_equalModuloPrecision<float, true>(x, y, maxElement);
-}
-
-void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
-	basicValueIteration_equalModuloPrecision<float, false>(x, y, maxElement);
-}
-
-bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
-	if (relativePrecisionCheck) {
-		return basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	} else {
-		return basicValueIteration_mvReduce<true, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	}
-}
-
-bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
-	if (relativePrecisionCheck) {
-		return basicValueIteration_mvReduce<false, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	} else {
-		return basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	}
-}
-
-bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
-	if (relativePrecisionCheck) {
-		return basicValueIteration_mvReduce<true, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	} else {
-		return basicValueIteration_mvReduce<true, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	}
-}
-
-bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t&
iterationCount) {
-	if (relativePrecisionCheck) {
-		return basicValueIteration_mvReduce<false, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	} else {
-		return basicValueIteration_mvReduce<false, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
-	}
-}
-
-size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
-	size_t const valueTypeSize = sizeof(double);
-	size_t const indexTypeSize = sizeof(uint_fast64_t);
-
-	/*
-	IndexType* device_matrixRowIndices = nullptr;
-	IndexType* device_matrixColIndices = nullptr;
-	ValueType* device_matrixValues = nullptr;
-	ValueType* device_x = nullptr;
-	ValueType* device_xSwap = nullptr;
-	ValueType* device_b = nullptr;
-	ValueType* device_multiplyResult = nullptr;
-	IndexType* device_nondeterministicChoiceIndices = nullptr;
-	*/
-
-	// Row Indices, Column Indices, Values, Choice Indices
-	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
-	// Vectors x, xSwap, b, multiplyResult
-	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
-
-	return (matrixDataSize + vectorSizes);
-}
-
-size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
-	size_t const valueTypeSize = sizeof(float);
-	size_t const indexTypeSize = sizeof(uint_fast64_t);
-
-	/*
-	IndexType* device_matrixRowIndices = nullptr;
-	IndexType* device_matrixColIndices = nullptr;
-	ValueType* device_matrixValues = nullptr;
-	ValueType* device_x = nullptr;
-	ValueType* device_xSwap = nullptr;
-	ValueType* device_b = nullptr;
-	ValueType* device_multiplyResult = nullptr;
-	IndexType* device_nondeterministicChoiceIndices = nullptr;
-	*/
-
-	// Row Indices, Column Indices, Values, Choice Indices
-	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
-	// Vectors x, xSwap, b, multiplyResult
-	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
-
-	return (matrixDataSize + vectorSizes);
-}
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h
deleted file mode 100644
index 09b4be5ca..000000000
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
-#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
-
-#include <cstdint>
-#include <vector>
-#include <utility>
-
-// Library exports
-#include "cudaForStorm_Export.h"
-
-/* Helper declaration to cope with new internal format */
-#ifndef STORM_STORAGE_SPARSEMATRIX_H_
-namespace storm {
-	namespace storage {
-		template<typename T>
-		class MatrixEntry {
-		public:
-			/*!
-			 * Constructs a matrix entry with the given column and value.
-			 *
-			 * @param column The column of the matrix entry.
-			 * @param value The value of the matrix entry.
-			 */
-			MatrixEntry(uint_fast64_t column, T value);
-
-			/*!
-			 * Move-constructs the matrix entry from the given column-value pair.
-			 *
-			 * @param pair The column-value pair from which to move-construct the matrix entry.
-			 */
-			MatrixEntry(std::pair<uint_fast64_t, T>&& pair);
-
-			//MatrixEntry() = default;
-			//MatrixEntry(MatrixEntry const& other) = default;
-			//MatrixEntry& operator=(MatrixEntry const& other) = default;
-#ifndef WINDOWS
-			//MatrixEntry(MatrixEntry&& other) = default;
-			//MatrixEntry& operator=(MatrixEntry&& other) = default;
-#endif
-
-			/*!
-			 * Retrieves the column of the matrix entry.
-			 *
-			 * @return The column of the matrix entry.
-			 */
-			uint_fast64_t const& getColumn() const;
-
-			/*!
-			 * Retrieves the column of the matrix entry.
-			 *
-			 * @return The column of the matrix entry.
-			 */
-			uint_fast64_t& getColumn();
-
-			/*!
-			 * Retrieves the value of the matrix entry.
-			 *
-			 * @return The value of the matrix entry.
-			 */
-			T const& getValue() const;
-
-			/*!
-			 * Retrieves the value of the matrix entry.
-			 *
-			 * @return The value of the matrix entry.
-			 */
-			T& getValue();
-
-			/*!
-			 * Retrieves a pair of column and value that characterizes this entry.
-			 *
-			 * @return A column-value pair that characterizes this entry.
-			 */
-			std::pair<uint_fast64_t, T> const& getColumnValuePair() const;
-
-		private:
-			// The actual matrix entry.
-			std::pair<uint_fast64_t, T> entry;
-		};
-
-	}
-}
-#endif
-
-cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
-cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
-cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
-
-cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
-cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
-cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
-
-cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b);
-cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b);
-cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>&
targetVector);
-cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector);
-cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement);
-cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement);
-
-cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b);
-cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b);
-cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector);
-cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector);
-cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement);
-cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement);
-
-#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/cudaForStorm.h b/resources/cudaForStorm/srcCuda/cudaForStorm.h
deleted file mode 100644
index 2ea39c2d0..000000000
--- a/resources/cudaForStorm/srcCuda/cudaForStorm.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef STORM_CUDAFORSTORM_CUDAFORSTORM_H_
-#define STORM_CUDAFORSTORM_CUDAFORSTORM_H_
-
-/*
- * List of exported functions in this library
- */
-
-// TopologicalValueIteration
-#include "srcCuda/basicValueIteration.h"
-
-// Utility Functions
-#include "srcCuda/utility.h"
-
-// Version Information
-#include "srcCuda/version.h"
-
-
-
-#endif // STORM_CUDAFORSTORM_CUDAFORSTORM_H_
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h
deleted file mode 100644
index 11c673bf9..000000000
--- a/resources/cudaForStorm/srcCuda/cuspExtension.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#pragma once
-
-#include "cuspExtensionFloat.h"
-#include "cuspExtensionDouble.h"
-
-namespace cusp {
-namespace detail {
-namespace device {
-
-template <typename ValueType>
-void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) {
-	//
-	throw;
-}
-template <>
-void storm_cuda_opt_spmv_csr_vector<double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) {
-	storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
-}
-template <>
-void storm_cuda_opt_spmv_csr_vector<float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) {
-	storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
-}
-
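
The dispatch scheme in cuspExtension.h above deserves a note: the primary template is a deliberate dead end (a bare throw), and only the explicit full specializations forward to the hand-tuned double and float kernels, so an unsupported value type fails loudly at run time instead of silently computing garbage. A condensed, self-contained sketch of the same pattern follows; all names are hypothetical, and it uses a named exception where the original uses a bare throw.

#include <stdexcept>

// Type-specific workers; in the real header these are the CUDA kernel wrappers.
void spmv_double(double const* x, double* y) { /* launch double kernel */ }
void spmv_float(float const* x, float* y) { /* launch float kernel */ }

// Primary template: reaching this for any other type is a programming error.
template <typename ValueType>
void spmv(ValueType const* x, ValueType* y) {
	throw std::runtime_error("no CUDA kernel for this value type");
}

// Explicit full specializations route the supported types to their kernels.
template <>
void spmv<double>(double const* x, double* y) { spmv_double(x, y); }
template <>
void spmv<float>(float const* x, float* y) { spmv_float(x, y); }

This keeps generic callers such as basicValueIteration_spmv fully templated while guaranteeing that only double and float ever reach a real kernel.
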
-template <bool Minimize, typename ValueType>
-void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) {
-	//
-	throw;
-}
-template <>
-void storm_cuda_opt_vector_reduce<true, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
-	storm_cuda_opt_vector_reduce_double<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
-}
-template <>
-void storm_cuda_opt_vector_reduce<false, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
-	storm_cuda_opt_vector_reduce_double<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
-}
-
-template <>
-void storm_cuda_opt_vector_reduce<true, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
-	storm_cuda_opt_vector_reduce_float<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
-}
-template <>
-void storm_cuda_opt_vector_reduce<false, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
-	storm_cuda_opt_vector_reduce_float<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
-}
-
-} // end namespace device
-} // end namespace detail
-} // end namespace cusp
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
deleted file mode 100644
index 901df0ae7..000000000
--- a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- * This is an extension of the original CUSP csr_vector.h SPMV implementation.
- * It is based on the Code and incorporates changes as to cope with the details
- * of the StoRM code.
- * Changes have been made for 1) different input format, 2) the sum calculation and 3) the group-reduce algorithm
- */
-
-/*
- * Copyright 2008-2009 NVIDIA Corporation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#pragma once
-
-#include <limits>
-#include <cstdint>
-#include <algorithm>
-
-#include <math_functions.h>
-
-#include <cusp/detail/device/spmv/csr_vector.h>
-
-namespace cusp
-{
-namespace detail
-{
-namespace device
-{
-
-//////////////////////////////////////////////////////////////////////////////
-// CSR SpMV kernels based on a vector model (one warp per row)
-//////////////////////////////////////////////////////////////////////////////
-//
-// spmv_csr_vector_device
-// Each row of the CSR matrix is assigned to a warp. The warp computes
-// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with
-// the x vector, in parallel. This division of work implies that
-// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
-// manner (but generally not aligned). On GT200 these accesses are
-// coalesced, unlike kernels based on the one-row-per-thread division of
-// work.
Since an entire 32-thread warp is assigned to each row, many -// threads will remain idle when their row contains a small number -// of elements. This code relies on implicit synchronization among -// threads in a warp. -// -// spmv_csr_vector_tex_device -// Same as spmv_csr_vector_tex_device, except that the texture cache is -// used for accessing the x vector. -// -// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] - - -template -__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) -__global__ void -storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y) -{ - __shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals - __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; - - const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; - - const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index - const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector - const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index - const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block - const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors - - for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) - { - // use two threads to fetch Ap[row] and Ap[row+1] - // this is considerably faster than the straightforward version - if(thread_lane < 2) - ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; - - const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; - const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; - - // initialize local sum - double sum = 0; - - if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) - { - // ensure aligned memory access to Aj and Ax - - uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; - - // accumulate local sums - if(jj >= row_start && jj < row_end) { - sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); - //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); - } - - // accumulate local sums - for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { - //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); - sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); - } - } else { - // accumulate local sums - for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { - //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); - sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); - } - } - - // store local sum in shared memory - sdata[threadIdx.x] = sum; - - // reduce local sums to row sum - if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; - if (THREADS_PER_VECTOR > 8) 
sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; - if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; - if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; - if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; - - // first thread writes the result - if (thread_lane == 0) - y[row] = sdata[threadIdx.x]; - } -} - -template -__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) -__global__ void -storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer) -{ - __shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals - __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; - - const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; - - const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index - const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector - const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index - const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block - const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors - - for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) - { - // use two threads to fetch Ap[row] and Ap[row+1] - // this is considerably faster than the straightforward version - if(thread_lane < 2) - ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; - - const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; - const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; - - // initialize local Min/Max - double localMinMaxElement = minMaxInitializer; - - if (THREADS_PER_ROW == 32 && row_end - row_start > 32) - { - // ensure aligned memory access to Aj and Ax - - uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; - - // accumulate local sums - if(jj >= row_start && jj < row_end) { - if(Minimize) { - localMinMaxElement = min(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = max(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; - } - } - - // accumulate local sums - for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) - if(Minimize) { - localMinMaxElement = min(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = max(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; - } - } - else - { - // accumulate local sums - for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) - if(Minimize) { - localMinMaxElement = min(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = max(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement < y[jj]) ? 
y[jj] : localMinMaxElement; - } - } - - // store local sum in shared memory - sdata[threadIdx.x] = localMinMaxElement; - - // reduce local min/max to row min/max - if (Minimize) { - /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ - - if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); - } else { - /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? 
sdata[threadIdx.x + 1] : localMinMaxElement);*/ - if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); - } - - // first thread writes the result - if (thread_lane == 0) - x[row] = sdata[threadIdx.x]; - } -} - -template -void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) -{ - double __minMaxInitializer = -std::numeric_limits::max(); - if (Minimize) { - __minMaxInitializer = std::numeric_limits::max(); - } - const double minMaxInitializer = __minMaxInitializer; - - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - storm_cuda_opt_vector_reduce_kernel_double <<>> - (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); -} - -template -void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) -{ - const uint_fast64_t rows_per_group = num_entries / num_rows; - - if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } - - __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); -} - -template -void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) -{ - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - if (UseCache) - bind_x(x); - - storm_cuda_opt_spmv_csr_vector_kernel_double <<>> - (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); - - if (UseCache) - unbind_x(x); -} - -void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) -{ - const uint_fast64_t nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { 
__storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - - __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); -} - -void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) -{ - const uint_fast64_t nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - - __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); -} - -// NON-OPT - -template -void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) -{ - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - if (UseCache) - bind_x(x); - - spmv_csr_vector_kernel <<>> - (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); - - if (UseCache) - unbind_x(x); -} - -void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) -{ - const uint_fast64_t nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - - __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); -} - -void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const 
double * matrixValues, const double* x, double* y) -{ - const uint_fast64_t nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - - __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); -} - -} // end namespace device -} // end namespace detail -} // end namespace cusp \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h b/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h deleted file mode 100644 index bb9acf78e..000000000 --- a/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h +++ /dev/null @@ -1,375 +0,0 @@ -/* - * This is an extension of the original CUSP csr_vector.h SPMV implementation. - * It is based on the Code and incorporates changes as to cope with the details - * of the StoRM code. - * As this is mostly copy & paste, the original license still applies. - */ - -/* - * Copyright 2008-2009 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include -#include - -#include - -#include - -#include "storm-cudaplugin-config.h" - -namespace cusp -{ -namespace detail -{ -namespace device -{ - -////////////////////////////////////////////////////////////////////////////// -// CSR SpMV kernels based on a vector model (one warp per row) -////////////////////////////////////////////////////////////////////////////// -// -// spmv_csr_vector_device -// Each row of the CSR matrix is assigned to a warp. The warp computes -// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with -// the x vector, in parallel. This division of work implies that -// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous -// manner (but generally not aligned). On GT200 these accesses are -// coalesced, unlike kernels based on the one-row-per-thread division of -// work. Since an entire 32-thread warp is assigned to each row, many -// threads will remain idle when their row contains a small number -// of elements. This code relies on implicit synchronization among -// threads in a warp. -// -// spmv_csr_vector_tex_device -// Same as spmv_csr_vector_tex_device, except that the texture cache is -// used for accessing the x vector. 
-// -// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] - - -template -__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) -__global__ void -storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y) -{ - __shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals - __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; - - const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; - - const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index - const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector - const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index - const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block - const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors - - for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) - { - // use two threads to fetch Ap[row] and Ap[row+1] - // this is considerably faster than the straightforward version - if(thread_lane < 2) - ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; - - const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; - const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; - - // initialize local sum - float sum = 0; - - if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) - { - // ensure aligned memory access to Aj and Ax - - uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; - - // accumulate local sums - if(jj >= row_start && jj < row_end) { -#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT - sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); -#else - sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); -#endif - //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); - } - - // accumulate local sums - for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { - //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); -#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT - sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); -#else - sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); -#endif - } - } else { - // accumulate local sums - for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { - //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); -#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT - sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); -#else - sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); -#endif - } - } - - // store local sum in shared memory - sdata[threadIdx.x] = sum; - - // reduce local sums 
to row sum - if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; - if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; - if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; - if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; - if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; - - // first thread writes the result - if (thread_lane == 0) - y[row] = sdata[threadIdx.x]; - } -} - -template -__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) -__global__ void -storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer) -{ - __shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals - __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; - - const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; - - const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index - const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector - const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index - const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block - const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors - - for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) - { - // use two threads to fetch Ap[row] and Ap[row+1] - // this is considerably faster than the straightforward version - if(thread_lane < 2) - ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; - - const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; - const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; - - // initialize local Min/Max - float localMinMaxElement = minMaxInitializer; - - if (THREADS_PER_ROW == 32 && row_end - row_start > 32) - { - // ensure aligned memory access to Aj and Ax - - uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; - - // accumulate local sums - if(jj >= row_start && jj < row_end) { - if(Minimize) { - localMinMaxElement = min(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = max(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; - } - } - - // accumulate local sums - for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) - if(Minimize) { - localMinMaxElement = min(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = max(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; - } - } - else - { - // accumulate local sums - for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) - if(Minimize) { - localMinMaxElement = min(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = max(localMinMaxElement, y[jj]); - //localMinMaxElement = (localMinMaxElement < y[jj]) ? 
y[jj] : localMinMaxElement; - } - } - - // store local sum in shared memory - sdata[threadIdx.x] = localMinMaxElement; - - // reduce local min/max to row min/max - if (Minimize) { - /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ - - if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); - } else { - /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? 
sdata[threadIdx.x + 1] : localMinMaxElement);*/ - if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); - } - - // first thread writes the result - if (thread_lane == 0) - x[row] = sdata[threadIdx.x]; - } -} - -template -void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) -{ - float __minMaxInitializer = -std::numeric_limits::max(); - if (Minimize) { - __minMaxInitializer = std::numeric_limits::max(); - } - const float minMaxInitializer = __minMaxInitializer; - - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - storm_cuda_opt_vector_reduce_kernel_float <<>> - (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); -} - -template -void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) -{ - const uint_fast64_t rows_per_group = num_entries / num_rows; - - if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } - - __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); -} - -template -void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) -{ - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - if (UseCache) - bind_x(x); - - storm_cuda_opt_spmv_csr_vector_kernel_float <<>> - (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); - - if (UseCache) - unbind_x(x); -} - -void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) -{ - const uint_fast64_t nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, 
matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - - __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); -} - -void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) -{ - const uint_fast64_t nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - - __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); -} - -// NON-OPT - -template -void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) -{ - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - if (UseCache) - bind_x(x); - - spmv_csr_vector_kernel <<>> - (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); - - if (UseCache) - unbind_x(x); -} - -void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) -{ - const uint_fast64_t nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - - __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); -} - -void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) -{ - const uint_fast64_t 
nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - - __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); -} - -} // end namespace device -} // end namespace detail -} // end namespace cusp \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu b/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu deleted file mode 100644 index 2be10e8ca..000000000 --- a/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu +++ /dev/null @@ -1,39 +0,0 @@ -#include -#include - -__global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) { - *B = *A; -} - -void kernelSwitchTest(size_t N) { - int* deviceIntA; - int* deviceIntB; - - if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) { - std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; - return; - } - if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) { - std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; - return; - } - - // Allocate space on the device - auto start_time = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < N; ++i) { - cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB); - } - auto end_time = std::chrono::high_resolution_clock::now(); - std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast(end_time - start_time).count() << "micros" << std::endl; - std::cout << "Resulting in " << (std::chrono::duration_cast(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl; - - // Free memory on device - if (cudaFree(deviceIntA) != cudaSuccess) { - std::cout << "Error in cudaFree!" << std::endl; - return; - } - if (cudaFree(deviceIntB) != cudaSuccess) { - std::cout << "Error in cudaFree!" 
<< std::endl; - return; - } -} \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/kernelSwitchTest.h b/resources/cudaForStorm/srcCuda/kernelSwitchTest.h deleted file mode 100644 index dff8a13ff..000000000 --- a/resources/cudaForStorm/srcCuda/kernelSwitchTest.h +++ /dev/null @@ -1 +0,0 @@ -void kernelSwitchTest(size_t N); \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/utility.cu b/resources/cudaForStorm/srcCuda/utility.cu deleted file mode 100644 index 99165ba07..000000000 --- a/resources/cudaForStorm/srcCuda/utility.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include "utility.h" - -#include - -size_t getFreeCudaMemory() { - size_t freeMemory; - size_t totalMemory; - cudaMemGetInfo(&freeMemory, &totalMemory); - - return freeMemory; -} - -size_t getTotalCudaMemory() { - size_t freeMemory; - size_t totalMemory; - cudaMemGetInfo(&freeMemory, &totalMemory); - - return totalMemory; -} - -bool resetCudaDevice() { - cudaError_t result = cudaDeviceReset(); - return (result == cudaSuccess); -} - -int getRuntimeCudaVersion() { - int result = -1; - cudaError_t errorResult = cudaRuntimeGetVersion(&result); - if (errorResult != cudaSuccess) { - return -1; - } - return result; -} \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/utility.h b/resources/cudaForStorm/srcCuda/utility.h deleted file mode 100644 index f3110fbeb..000000000 --- a/resources/cudaForStorm/srcCuda/utility.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef STORM_CUDAFORSTORM_UTILITY_H_ -#define STORM_CUDAFORSTORM_UTILITY_H_ - -// Library exports -#include "cudaForStorm_Export.h" - -cudaForStorm_EXPORT size_t getFreeCudaMemory(); -cudaForStorm_EXPORT size_t getTotalCudaMemory(); -cudaForStorm_EXPORT bool resetCudaDevice(); -cudaForStorm_EXPORT int getRuntimeCudaVersion(); - -#endif // STORM_CUDAFORSTORM_UTILITY_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/version.cu b/resources/cudaForStorm/srcCuda/version.cu deleted file mode 100644 index 3850c895c..000000000 --- a/resources/cudaForStorm/srcCuda/version.cu +++ /dev/null @@ -1,28 +0,0 @@ -#include "version.h" - -#include "storm-cudaplugin-config.h" - -size_t getStormCudaPluginVersionMajor() { - return STORM_CUDAPLUGIN_VERSION_MAJOR; -} - -size_t getStormCudaPluginVersionMinor() { - return STORM_CUDAPLUGIN_VERSION_MINOR; -} - -size_t getStormCudaPluginVersionPatch() { - return STORM_CUDAPLUGIN_VERSION_PATCH; -} - -size_t getStormCudaPluginVersionCommitsAhead() { - return STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD; -} - -const char* getStormCudaPluginVersionHash() { - static const std::string versionHash = STORM_CUDAPLUGIN_VERSION_HASH; - return versionHash.c_str(); -} - -bool getStormCudaPluginVersionIsDirty() { - return ((STORM_CUDAPLUGIN_VERSION_DIRTY) != 0); -} \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/version.h b/resources/cudaForStorm/srcCuda/version.h deleted file mode 100644 index de3f4f16c..000000000 --- a/resources/cudaForStorm/srcCuda/version.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef STORM_CUDAFORSTORM_VERSION_H_ -#define STORM_CUDAFORSTORM_VERSION_H_ - -// Library exports -#include "cudaForStorm_Export.h" - -#include - -cudaForStorm_EXPORT size_t getStormCudaPluginVersionMajor(); -cudaForStorm_EXPORT size_t getStormCudaPluginVersionMinor(); -cudaForStorm_EXPORT size_t getStormCudaPluginVersionPatch(); -cudaForStorm_EXPORT size_t getStormCudaPluginVersionCommitsAhead(); -cudaForStorm_EXPORT const char* getStormCudaPluginVersionHash(); -cudaForStorm_EXPORT bool 
getStormCudaPluginVersionIsDirty(); - -#endif // STORM_CUDAFORSTORM_VERSION_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/storm-cudaplugin-config.h.in b/resources/cudaForStorm/storm-cudaplugin-config.h.in deleted file mode 100644 index 3703d0c81..000000000 --- a/resources/cudaForStorm/storm-cudaplugin-config.h.in +++ /dev/null @@ -1,21 +0,0 @@ -/* - * StoRM - Build-in Options - * - * This file is parsed by CMake during makefile generation - */ - -#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ -#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ - -// Version Information -#define STORM_CUDAPLUGIN_VERSION_MAJOR @STORM_CUDAPLUGIN_VERSION_MAJOR@ // The major version of StoRM -#define STORM_CUDAPLUGIN_VERSION_MINOR @STORM_CUDAPLUGIN_VERSION_MINOR@ // The minor version of StoRM -#define STORM_CUDAPLUGIN_VERSION_PATCH @STORM_CUDAPLUGIN_VERSION_PATCH@ // The patch version of StoRM -#define STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD @STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD@ // How many commits passed since the tag was last set -#define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" // The short hash of the git commit this build is bases on -#define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ // 0 iff there no files were modified in the checkout, 1 else - -// Whether the size of float in a pair is expanded to 64bit -#@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT - -#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ diff --git a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h index 7b9c58d52..ad1937a6d 100644 --- a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h +++ b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h @@ -15,7 +15,6 @@ namespace storm { namespace modelchecker { -namespace prctl { /* * An implementation of the SparseMdpPrctlModelChecker interface that uses topoligical value iteration for solving @@ -38,7 +37,7 @@ public: * Copy constructs a SparseMdpPrctlModelChecker from the given model checker. In particular, this means that the newly * constructed model checker will have the model of the given model checker as its associated model. */ - explicit TopologicalValueIterationMdpPrctlModelChecker(storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker const& modelchecker) + explicit TopologicalValueIterationMdpPrctlModelChecker(storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker const& modelchecker) : SparseMdpPrctlModelChecker(modelchecker) { // Intentionally left empty. 
} @@ -49,7 +48,6 @@ public: virtual ~TopologicalValueIterationMdpPrctlModelChecker() { } }; -} // namespace prctl } // namespace modelchecker } // namespace storm diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 812634cd0..9f2981e92 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -36,6 +36,12 @@ namespace storm { this->maximalNumberOfIterations = settings.getMaximalIterationCount(); this->precision = settings.getPrecision(); this->relative = (settings.getConvergenceCriterion() == storm::settings::modules::TopologicalValueIterationEquationSolverSettings::ConvergenceCriterion::Relative); + + auto generalSettings = storm::settings::generalSettings(); + this->enableCuda = generalSettings.isCudaSet(); +#ifdef STORM_HAVE_CUDA + STORM_LOG_INFO_COND(this->enableCuda, "Option CUDA was not set, but the topological value iteration solver will use it anyways."); +#endif } template diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h index 03da93a83..4ee64b9f8 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h @@ -44,6 +44,8 @@ namespace storm { virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult = nullptr, std::vector* newX = nullptr) const override; private: + + bool enableCuda; /*! * Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory. */ diff --git a/src/utility/cli.h b/src/utility/cli.h index 6427f7dd9..e00c4f612 100644 --- a/src/utility/cli.h +++ b/src/utility/cli.h @@ -25,6 +25,10 @@ #ifdef STORM_HAVE_MSAT # include "mathsat.h" #endif +#ifdef STORM_HAVE_CUDA +#include +#include +#endif #include "log4cplus/logger.h" #include "log4cplus/loggingmacros.h" @@ -60,6 +64,7 @@ log4cplus::Logger printer; #include "src/modelchecker/prctl/SparseDtmcPrctlModelChecker.h" #include "src/modelchecker/reachability/SparseDtmcEliminationModelChecker.h" #include "src/modelchecker/prctl/SparseMdpPrctlModelChecker.h" +#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h" // Headers for counterexample generation. #include "src/counterexamples/MILPMinimalLabelSetGenerator.h" @@ -151,6 +156,44 @@ namespace storm { std::cout << "Linked with " << msatVersion << "." << std::endl; msat_free(msatVersion); #endif +#ifdef STORM_HAVE_CUDA + int deviceCount = 0; + cudaError_t error_id = cudaGetDeviceCount(&deviceCount); + + if (error_id == cudaSuccess) + { + std::cout << "Compiled with CUDA support, "; + // This function call returns 0 if there are no CUDA capable devices. + if (deviceCount == 0) + { + std::cout<< "but there are no available device(s) that support CUDA." 
<< std::endl; + } else + { + std::cout << "detected " << deviceCount << " CUDA Capable device(s):" << std::endl; + } + + int dev, driverVersion = 0, runtimeVersion = 0; + + for (dev = 0; dev < deviceCount; ++dev) + { + cudaSetDevice(dev); + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + + std::cout << "CUDA Device " << dev << ": \"" << deviceProp.name << "\"" << std::endl; + + // Console log + cudaDriverGetVersion(&driverVersion); + cudaRuntimeGetVersion(&runtimeVersion); + std::cout << " CUDA Driver Version / Runtime Version " << driverVersion / 1000 << "." << (driverVersion % 100) / 10 << " / " << runtimeVersion / 1000 << "." << (runtimeVersion % 100) / 10 << std::endl; + std::cout << " CUDA Capability Major/Minor version number: " << deviceProp.major<<"."<getType() == storm::models::MDP) { - std::shared_ptr> mdp = model->template as>(); - storm::modelchecker::SparseMdpPrctlModelChecker modelchecker(*mdp); - result = modelchecker.check(*formula.get()); + std::shared_ptr> mdp = model->template as>(); +#ifdef STORM_HAVE_CUDA + if (settings.isCudaSet()) { + storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker modelchecker(*mdp); + result = modelchecker.check(*formula.get()); + } else { + storm::modelchecker::SparseMdpPrctlModelChecker modelchecker(*mdp); + result = modelchecker.check(*formula.get()); + } +#else + storm::modelchecker::SparseMdpPrctlModelChecker modelchecker(*mdp); + result = modelchecker.check(*formula.get()); +#endif } if (result) { diff --git a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp index d656ad9e4..56d9253f6 100644 --- a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp +++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp @@ -20,7 +20,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { ASSERT_EQ(mdp->getNumberOfStates(), 169ull); ASSERT_EQ(mdp->getNumberOfTransitions(), 436ull); - storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); + storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); //storm::property::prctl::Ap* apFormula = new storm::property::prctl::Ap("two"); auto apFormula = std::make_shared("two"); @@ -138,7 +138,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { // ------------- state rewards -------------- std::shared_ptr> stateRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", "")->as>(); - storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker stateRewardModelChecker(*stateRewardMdp); + storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker stateRewardModelChecker(*stateRewardMdp); apFormula = std::make_shared("done"); @@ -174,7 +174,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { // -------------------------------- state and transition reward ------------------------ std::shared_ptr> stateAndTransitionRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", STORM_CPP_BASE_PATH 
"/examples/mdp/two_dice/two_dice.flip.trans.rew")->as>(); - storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp); + storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp); apFormula = std::make_shared("done"); @@ -214,7 +214,7 @@ TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { ASSERT_EQ(mdp->getNumberOfStates(), 3172ull); ASSERT_EQ(mdp->getNumberOfTransitions(), 7144ull); - storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); + storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker mc(*mdp); auto apFormula = std::make_shared("elected"); auto eventuallyFormula = std::make_shared(apFormula);