diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..92b35f18f --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "resources/3rdparty/cusplibrary"] + path = resources/3rdparty/cusplibrary + url = https://github.com/cusplibrary/cusplibrary.git diff --git a/CMakeLists.txt b/CMakeLists.txt index f171293a3..d9dd82f16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ option(STORM_USE_INTELTBB "Sets whether the Intel TBB libraries should be used." option(STORM_USE_COTIRE "Sets whether Cotire should be used (for building precompiled headers)." OFF) option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF) option(USE_LIBCXX "Sets whether the standard library is libc++." OFF) +option(STORM_USE_CUDAFORSTORM "Sets whether StoRM is built with its CUDA extension." OFF) set(GUROBI_ROOT "" CACHE STRING "The root directory of Gurobi (if available).") set(Z3_ROOT "" CACHE STRING "The root directory of Z3 (if available).") set(CUDA_ROOT "" CACHE STRING "The root directory of CUDA.") @@ -212,6 +213,13 @@ endif() # glpk defines set(STORM_CPP_GLPK_DEF "define") +# CUDA Defines +if (STORM_USE_CUDAFORSTORM) + set(STORM_CPP_CUDAFORSTORM_DEF "define") +else() + set(STORM_CPP_CUDAFORSTORM_DEF "undef") +endif() + # Z3 Defines if (ENABLE_Z3) set(STORM_CPP_Z3_DEF "define") @@ -352,6 +360,9 @@ endif() if (ENABLE_Z3) link_directories("${Z3_ROOT}/bin") endif() +if (STORM_USE_CUDAFORSTORM) + link_directories("${PROJECT_BINARY_DIR}/cudaForStorm/lib") +endif() if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL "")) set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib") endif () @@ -386,6 +397,20 @@ target_link_libraries(storm ${Boost_LIBRARIES}) #message(STATUS "BOOST_INCLUDE_DIRS is ${Boost_INCLUDE_DIRS}") #message(STATUS "BOOST_LIBRARY_DIRS is ${Boost_LIBRARY_DIRS}") +############################################################# +## +## CUDA For Storm +## +############################################################# +if (STORM_USE_CUDAFORSTORM) + message (STATUS "StoRM - Linking with CudaForStorm") + include_directories("${PROJECT_BINARY_DIR}/cudaForStorm/include") + include_directories("${PROJECT_SOURCE_DIR}/resources/cudaForStorm") + target_link_libraries(storm cudaForStorm) + target_link_libraries(storm-functional-tests cudaForStorm) + target_link_libraries(storm-performance-tests cudaForStorm) +endif(STORM_USE_CUDAFORSTORM) + ############################################################# ## ## CUDD diff --git a/examples/mdp/scc/scc.pctl b/examples/mdp/scc/scc.pctl new file mode 100644 index 000000000..501655389 --- /dev/null +++ b/examples/mdp/scc/scc.pctl @@ -0,0 +1,2 @@ +Pmin=? [ (!statetwo) U end ] +Pmax=? [ (!statetwo) U end ] \ No newline at end of file diff --git a/resources/3rdparty/cusplibrary b/resources/3rdparty/cusplibrary new file mode 160000 index 000000000..d8d7d9e97 --- /dev/null +++ b/resources/3rdparty/cusplibrary @@ -0,0 +1 @@ +Subproject commit d8d7d9e97add8db08ef0ad5c0a7e9929fd83ce3c diff --git a/resources/cmake/FindCusp.cmake b/resources/cmake/FindCusp.cmake new file mode 100644 index 000000000..bda100911 --- /dev/null +++ b/resources/cmake/FindCusp.cmake @@ -0,0 +1,56 @@ +# +# FindCusp +# +# This module finds the CUSP header files and extracts their version. It +# sets the following variables. +# +# CUSP_INCLUDE_DIR - Include directory for cusp header files. (All header +# files will actually be in the cusp subdirectory.) +# CUSP_VERSION - Version of cusp in the form "major.minor.patch". 
+#
+# CUSP_FOUND - Indicates whether Cusp has been found
+#
+
+find_path(CUSP_INCLUDE_DIR
+  HINTS
+    /usr/include/cusp
+    /usr/local/include
+    /usr/local/cusp/include
+    ${CUSP_INCLUDE_DIRS}
+    ${CUSP_HINT}
+  NAMES cusp/version.h
+  DOC "Cusp headers"
+)
+if(CUSP_INCLUDE_DIR)
+  list(REMOVE_DUPLICATES CUSP_INCLUDE_DIR)
+
+  # Find cusp version
+  file(STRINGS ${CUSP_INCLUDE_DIR}/cusp/version.h
+    version
+    REGEX "#define CUSP_VERSION[ \t]+([0-9x]+)"
+  )
+  string(REGEX REPLACE
+    "#define CUSP_VERSION[ \t]+"
+    ""
+    version
+    "${version}"
+  )
+
+  #define CUSP_MAJOR_VERSION (CUSP_VERSION / 100000)
+  #define CUSP_MINOR_VERSION (CUSP_VERSION / 100 % 1000)
+  #define CUSP_SUBMINOR_VERSION (CUSP_VERSION % 100)
+
+  math(EXPR CUSP_MAJOR_VERSION "${version} / 100000")
+  math(EXPR CUSP_MINOR_VERSION "${version} / 100 % 1000")
+  math(EXPR CUSP_PATCH_VERSION "${version} % 100")
+
+  set(CUSP_VERSION "${CUSP_MAJOR_VERSION}.${CUSP_MINOR_VERSION}.${CUSP_PATCH_VERSION}")
+
+  # Check for required components
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Cusp REQUIRED_VARS CUSP_INCLUDE_DIR VERSION_VAR CUSP_VERSION)
+
+  set(CUSP_INCLUDE_DIRS ${CUSP_INCLUDE_DIR})
+  mark_as_advanced(CUSP_INCLUDE_DIR)
+
+endif(CUSP_INCLUDE_DIR)
\ No newline at end of file
diff --git a/resources/cmake/FindThrust.cmake b/resources/cmake/FindThrust.cmake
new file mode 100644
index 000000000..8f811bda3
--- /dev/null
+++ b/resources/cmake/FindThrust.cmake
@@ -0,0 +1,52 @@
+#
+# FindThrust
+#
+# This module finds the Thrust header files and extracts their version. It
+# sets the following variables.
+#
+# THRUST_INCLUDE_DIR - Include directory for thrust header files. (All header
+# files will actually be in the thrust subdirectory.)
+# THRUST_VERSION - Version of thrust in the form "major.minor.patch".
+#
+# Thrust_FOUND - Indicates whether Thrust has been found
+#
+
+find_path(THRUST_INCLUDE_DIR
+  HINTS
+    /usr/include/cuda
+    /usr/local/include
+    /usr/local/cuda/include
+    ${CUDA_INCLUDE_DIRS}
+  NAMES thrust/version.h
+  DOC "Thrust headers"
+)
+if(THRUST_INCLUDE_DIR)
+  list(REMOVE_DUPLICATES THRUST_INCLUDE_DIR)
+endif(THRUST_INCLUDE_DIR)
+
+# Find thrust version
+file(STRINGS ${THRUST_INCLUDE_DIR}/thrust/version.h
+  version
+  REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)"
+)
+string(REGEX REPLACE
+  "#define THRUST_VERSION[ \t]+"
+  ""
+  version
+  "${version}"
+)
+
+string(REGEX MATCH "^[0-9]" major ${version})
+string(REGEX REPLACE "^${major}00" "" version "${version}")
+string(REGEX MATCH "^[0-9]" minor ${version})
+string(REGEX REPLACE "^${minor}0" "" version "${version}")
+set(THRUST_VERSION "${major}.${minor}.${version}")
+set(THRUST_MAJOR_VERSION "${major}")
+set(THRUST_MINOR_VERSION "${minor}")
+
+# Check for required components
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Thrust REQUIRED_VARS THRUST_INCLUDE_DIR VERSION_VAR THRUST_VERSION)
+
+set(THRUST_INCLUDE_DIRS ${THRUST_INCLUDE_DIR})
+mark_as_advanced(THRUST_INCLUDE_DIR)
\ No newline at end of file
diff --git a/resources/cudaForStorm/CMakeAlignmentCheck.cpp b/resources/cudaForStorm/CMakeAlignmentCheck.cpp
new file mode 100644
index 000000000..1dc9b470b
--- /dev/null
+++ b/resources/cudaForStorm/CMakeAlignmentCheck.cpp
@@ -0,0 +1,64 @@
+/*
+ * This is a component of the StoRM CUDA plugin that checks whether the type alignment matches the assumptions made while optimizing the code.
+ */
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#define CONTAINER_SIZE 100ul
+
+template<typename IndexType, typename ValueType>
+int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
+    std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
+    for (size_t i = 0; i < containerSize; ++i) {
+        myVector->push_back(std::make_pair(firstValue, secondValue));
+    }
+    size_t myVectorSize = myVector->size();
+    IndexType* firstStart = &(myVector->at(0).first);
+    IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
+    ValueType* secondStart = &(myVector->at(0).second);
+    ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
+    size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
+    size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
+    size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
+    size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);
+
+    delete myVector;
+    myVector = nullptr;
+
+    if (myVectorSize != containerSize) {
+        return -2;
+    }
+
+    // Check for alignment:
+    // Requirement is that the pairs are aligned like: first, second, first, second, first, second, ...
+    if (sizeof(IndexType) != sizeof(ValueType)) {
+        return -3;
+    }
+    if (startOffset != sizeof(IndexType)) {
+        return -4;
+    }
+    if (endOffset != sizeof(IndexType)) {
+        return -5;
+    }
+    if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
+        return -6;
+    }
+    if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
+        return -7;
+    }
+
+    return 0;
+}
+
+int main(int argc, char* argv[]) {
+    int result = 0;
+
+    result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
+    if (result != 0) {
+        return result;
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp b/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
new file mode 100644
index 000000000..7b3b7a8b1
--- /dev/null
+++ b/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
@@ -0,0 +1,31 @@
+/*
+ * This is a component of the StoRM CUDA plugin that checks whether a pair of uint_fast64_t and float is auto-aligned to 64-bit boundaries.
+ */
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#define CONTAINER_SIZE 100ul
+
+int main(int argc, char* argv[]) {
+    int result = 0;
+
+    std::vector<std::pair<uint_fast64_t, float>> myVector;
+    for (size_t i = 0; i < CONTAINER_SIZE; ++i) {
+        myVector.push_back(std::make_pair(i, 42.12345f * i));
+    }
+
+    char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first));
+    char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first));
+    ptrdiff_t uintDiff = secondUintPointer - firstUintPointer;
+
+    if (uintDiff == (2 * sizeof(uint_fast64_t))) {
+        result = 2;
+    } else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) {
+        result = 3;
+    } else {
+        result = -5;
+    }
+
+    return result;
+}
\ No newline at end of file
diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt
new file mode 100644
index 000000000..d7d525386
--- /dev/null
+++ b/resources/cudaForStorm/CMakeLists.txt
@@ -0,0 +1,294 @@
+cmake_minimum_required (VERSION 2.8.6)
+
+# Set project name
+project (cudaForStorm CXX C)
+
+# Set the version number
+set (STORM_CPP_VERSION_MAJOR 1)
+set (STORM_CPP_VERSION_MINOR 0)
+
+# Add base folder for better inclusion paths
+include_directories("${PROJECT_SOURCE_DIR}")
+include_directories("${PROJECT_SOURCE_DIR}/src")
+
+message(STATUS "StoRM (CudaPlugin) - CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}")
+ +############################################################# +## +## CMake options of StoRM +## +############################################################# +option(CUDAFORSTORM_DEBUG "Sets whether the DEBUG mode is used" ON) +option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF) +option(USE_LIBCXX "Sets whether the standard library is libc++." OFF) + +set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.") +set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.") +set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE STRING "The Build directory of storm, where the library files should be installed to (if available).") + +############################################################# +## +## Inclusion of required libraries +## +############################################################# + +# Add the resources/cmake folder to Module Search Path for FindTBB.cmake +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/") + +# Set the hint for CUSP +set(CUSP_HINT "${PROJECT_SOURCE_DIR}/../3rdparty/cusplibrary") + +find_package(CUDA REQUIRED) +find_package(Cusp REQUIRED) +find_package(Doxygen REQUIRED) +find_package(Thrust REQUIRED) + +# If the DEBUG option was turned on, we will target a debug version and a release version otherwise +if (CUDAFORSTORM_DEBUG) + set (CMAKE_BUILD_TYPE "DEBUG") +else() + set (CMAKE_BUILD_TYPE "RELEASE") +endif() +message(STATUS "StoRM (CudaPlugin) - Building ${CMAKE_BUILD_TYPE} version.") + +message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") +message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}") + +############################################################# +## +## CUDA Options +## +############################################################# +SET (CUDA_VERBOSE_BUILD ON CACHE BOOL "nvcc verbose" FORCE) +set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON) +set(BUILD_SHARED_LIBS OFF) +set(CUDA_SEPARABLE_COMPILATION ON) +#set(CUDA_NVCC_FLAGS "-arch=sm_30") + +# Because the FindCUDA.cmake file has a path related bug, two folders have to be present +file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Debug") +file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Release") + + +############################################################# +## +## Compiler specific settings and definitions +## +############################################################# +if(CMAKE_COMPILER_IS_GNUCC) + message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: GCC") + # Set standard flags for GCC + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -pedantic") +elseif(MSVC) + message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: MSVC") + # required for GMM to compile, ugly error directive in their code + add_definitions(/D_SCL_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS) + # required as the PRCTL Parser bloats object files (COFF) beyond their maximum size (see http://msdn.microsoft.com/en-us/library/8578y171(v=vs.110).aspx) + add_definitions(/bigobj) + # required by GTest and PrismGrammar::createIntegerVariable + add_definitions(/D_VARIADIC_MAX=10) + # Windows.h breaks GMM in gmm_except.h because of its macro definition for min and max + add_definitions(/DNOMINMAX) +else(CLANG) + message(STATUS "StoRM (CudaPlugin) - Using Compiler 
Configuration: Clang (LLVM)") + # As CLANG is not set as a variable, we need to set it in case we have not matched another compiler. + set (CLANG ON) + # Set standard flags for clang + set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops -O3") + if(UNIX AND NOT APPLE AND NOT USE_LIBCXX) + set(CLANG_STDLIB libstdc++) + message(STATUS "StoRM (CudaPlugin) - Linking against libstdc++") + else() + set(CLANG_STDLIB libc++) + message(STATUS "StoRM (CudaPlugin) - Linking against libc++") + # Set up some Xcode specific settings + set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") + endif() + + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=${CLANG_STDLIB} -Wall -pedantic -Wno-unused-variable -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -ftemplate-depth=1024") + + set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g") +endif() + +############################################################# +## +## CMake-generated Config File for StoRM +## +############################################################# + +# Test for type alignment +try_run(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT + ${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp" + COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR +) +if(NOT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT) + message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}") +elseif(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT EQUAL 0) + message(STATUS "StoRM (CudaPlugin) - Result of Type Alignment Check: OK.") +else() + message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})") +endif() + +# Test for Float 64bit Alignment +try_run(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT + ${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp" + COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR +) +if(NOT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT) + message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test float type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}") +elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 2) + message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment active.") + set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "define") +elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 3) + message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment disabled.") + set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "undef") +else() + message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Float Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_FLOATALIGNMENT})") +endif() + + +# +# Make a version file containing the current version from git. 
+# +include(GetGitRevisionDescription) +git_describe_checkout(STORM_GIT_VERSION_STRING) +# Parse the git Tag into variables +string(REGEX REPLACE "^([0-9]+)\\..*" "\\1" STORM_CUDAPLUGIN_VERSION_MAJOR "${STORM_GIT_VERSION_STRING}") +string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_MINOR "${STORM_GIT_VERSION_STRING}") +string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_PATCH "${STORM_GIT_VERSION_STRING}") +string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-([0-9]+)\\-.*" "\\1" STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD "${STORM_GIT_VERSION_STRING}") +string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-([a-z0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_HASH "${STORM_GIT_VERSION_STRING}") +string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-[a-z0-9]+\\-(.*)" "\\1" STORM_CUDAPLUGIN_VERSION_APPENDIX "${STORM_GIT_VERSION_STRING}") +if ("${STORM_CUDAPLUGIN_VERSION_APPENDIX}" MATCHES "^.*dirty.*$") + set(STORM_CUDAPLUGIN_VERSION_DIRTY 1) +else() + set(STORM_CUDAPLUGIN_VERSION_DIRTY 0) +endif() +message(STATUS "StoRM (CudaPlugin) - Version information: ${STORM_CUDAPLUGIN_VERSION_MAJOR}.${STORM_CUDAPLUGIN_VERSION_MINOR}.${STORM_CUDAPLUGIN_VERSION_PATCH} (${STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD} commits ahead of Tag) build from ${STORM_CUDAPLUGIN_VERSION_HASH} (Dirty: ${STORM_CUDAPLUGIN_VERSION_DIRTY})") + + +# Configure a header file to pass some of the CMake settings to the source code +configure_file ( + "${PROJECT_SOURCE_DIR}/storm-cudaplugin-config.h.in" + "${PROJECT_BINARY_DIR}/include/storm-cudaplugin-config.h" +) +# Add the binary dir include directory for storm-config.h +include_directories("${PROJECT_BINARY_DIR}/include") + +# Add the main source directory for includes +include_directories("${PROJECT_SOURCE_DIR}/../../src") + +############################################################# +## +## Source file aggregation and clustering +## +############################################################# +file(GLOB_RECURSE CUDAFORSTORM_HEADERS ${PROJECT_SOURCE_DIR}/src/*.h) +file(GLOB_RECURSE CUDAFORSTORM_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp) + +file(GLOB_RECURSE CUDAFORSTORM_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/srcCuda/*.cu") +file(GLOB_RECURSE CUDAFORSTORM_CUDA_HEADERS "${PROJECT_SOURCE_DIR}/srcCuda/*.h") + +# Additional include files like the storm-config.h +file(GLOB_RECURSE CUDAFORSTORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h) + +# Group the headers and sources +source_group(main FILES ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) +source_group(cuda FILES ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}) + +# Add custom additional include or link directories +if (ADDITIONAL_INCLUDE_DIRS) + message(STATUS "StoRM (CudaPlugin) - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}") + include_directories(${ADDITIONAL_INCLUDE_DIRS}) +endif(ADDITIONAL_INCLUDE_DIRS) +if (ADDITIONAL_LINK_DIRS) + message(STATUS "StoRM (CudaPlugin) - Using additional link directories ${ADDITIONAL_LINK_DIRS}") + link_directories(${ADDITIONAL_LINK_DIRS}) +endif(ADDITIONAL_LINK_DIRS) + +############################################################# +## +## Pre executable-creation link_directories setup +## +############################################################# + + + +############################################################# +## +## CUDA +## +############################################################# +#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30) +#target_link_libraries(cudaLibrary 
${CUDA_cusparse_LIBRARY}) +#ADD_DEPENDENCIES(cudaForStorm cudaLibrary) +#target_link_libraries(cudaForStorm cudaLibrary) +message(STATUS "StoRM (CudaPlugin) - Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}") +include_directories(${CUDA_INCLUDE_DIRS}) + +############################################################# +## +## CUSP +## +############################################################# +if(CUSP_FOUND) + include_directories(${CUSP_INCLUDE_DIR}) + cuda_include_directories(${CUSP_INCLUDE_DIR}) + message(STATUS "StoRM (CudaPlugin) - Found CUSP Version ${CUSP_VERSION} in location ${CUSP_INCLUDE_DIR}") +else() + message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find CUSP!") +endif() + +############################################################# +## +## Thrust +## +############################################################# +if(THRUST_FOUND) + include_directories(${THRUST_INCLUDE_DIR}) + cuda_include_directories(${THRUST_INCLUDE_DIR}) + message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION} in location ${THRUST_INCLUDE_DIR}") +else() + message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find Thrust! Check your CUDA installation.") +endif() + +############################################################################### +## # +## Executable Creation # +## # +## All link_directories() calls AND include_directories() calls # +## MUST be made before this point # +## # +############################################################################### +include (GenerateExportHeader) + +cuda_add_library(cudaForStorm SHARED + ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS} + OPTIONS -DSTUFF="" -arch=sm_30 + RELEASE -DNDEBUG + DEBUG -g -DDEBUG +) +GENERATE_EXPORT_HEADER( cudaForStorm + BASE_NAME cudaForStorm + EXPORT_MACRO_NAME cudaForStorm_EXPORT + EXPORT_FILE_NAME include/cudaForStorm_Export.h + STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC +) + +if (MSVC) + # Add the DebugHelper DLL + set(CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES} Dbghelp.lib") + target_link_libraries(cudaForStorm "Dbghelp.lib") +endif(MSVC) + +# Link against libc++abi if requested. May be needed to build on Linux systems using clang. 
+if (LINK_LIBCXXABI)
+    message (STATUS "StoRM (CudaPlugin) - Linking against libc++abi.")
+    target_link_libraries(cudaForStorm "c++abi")
+endif(LINK_LIBCXXABI)
+
+# Install Directive
+install(TARGETS cudaForStorm DESTINATION "${STORM_LIB_INSTALL_DIR}/lib")
+install(FILES "${PROJECT_SOURCE_DIR}/srcCuda/cudaForStorm.h" "${PROJECT_BINARY_DIR}/include/cudaForStorm_Export.h" DESTINATION "${STORM_LIB_INSTALL_DIR}/include")
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/allCudaKernels.h b/resources/cudaForStorm/srcCuda/allCudaKernels.h
new file mode 100644
index 000000000..50bf92191
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/allCudaKernels.h
@@ -0,0 +1,6 @@
+#include "utility.h"
+#include "bandWidth.h"
+#include "basicAdd.h"
+#include "kernelSwitchTest.h"
+#include "basicValueIteration.h"
+#include "version.h"
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/bandWidth.cu b/resources/cudaForStorm/srcCuda/bandWidth.cu
new file mode 100644
index 000000000..e69de29bb
diff --git a/resources/cudaForStorm/srcCuda/bandWidth.h b/resources/cudaForStorm/srcCuda/bandWidth.h
new file mode 100644
index 000000000..e69de29bb
diff --git a/resources/cudaForStorm/srcCuda/basicAdd.cu b/resources/cudaForStorm/srcCuda/basicAdd.cu
new file mode 100644
index 000000000..88b44e3bf
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/basicAdd.cu
@@ -0,0 +1,286 @@
+#include <stdio.h>
+#include <chrono>
+#include <iostream>
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+__global__ void cuda_kernel_basicAdd(int a, int b, int *c) {
+    *c = a + b;
+}
+
+__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
+    // Fused Multiply Add:
+    // A * B + C => D
+
+    /*
+     * The variable i indexes into the arrays. Since every thread executes this function, i must differ
+     * between threads; otherwise several threads would write to the same array index. blockDim.x is the
+     * number of threads in the x-dimension of the block, blockIdx.x is the x-coordinate of the current
+     * block, and threadIdx.x is the x-coordinate of the thread that is executing this instance of the
+     * function.
+     */
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < N) {
+        D[i] = A[i] * B[i] + C[i];
+    }
+}
+
+__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) {
+    // Fused Multiply Add:
+    // A * B + C => D
+
+    // Layout:
+    // A B C D A B C D A B C D
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if ((i*M) < N) {
+        for (int j = i*M; j < i*M + M; ++j) {
+            A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2];
+        }
+    }
+}
+
+extern "C" int cuda_basicAdd(int a, int b) {
+    int c = 0;
+    int *dev_c;
+    cudaMalloc((void**)&dev_c, sizeof(int));
+    cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c);
+    cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
+    //printf("%d + %d + 42 is %d\n", a, b, c);
+    cudaFree(dev_c);
+    return c;
+}
+
+void cpp_cuda_bandwidthTest(int entryCount, int N) {
+    // Size of the arrays in bytes
+    size_t arraySize = entryCount * sizeof(int);
+
+    int* deviceIntArray;
+    int* hostIntArray = new int[entryCount]; // entryCount ints, i.e. arraySize bytes
+
+    // Allocate space on the device
+    auto start_time = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < N; ++i) {
+        if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
+            std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!"
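/* [Editorial sketch, not part of this patch. The std::chrono timing used throughout this
   function measures host-side wall-clock time, including driver overhead. An alternative using
   only the CUDA runtime API would time the same region with events ("ms" is a hypothetical
   output variable):

       cudaEvent_t start, stop;
       cudaEventCreate(&start);
       cudaEventCreate(&stop);
       cudaEventRecord(start);
       // ... operations to be timed ...
       cudaEventRecord(stop);
       cudaEventSynchronize(stop);
       float ms = 0.0f;
       cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
       cudaEventDestroy(start);
       cudaEventDestroy(stop);
*/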
 << std::endl;
+            delete[] hostIntArray;
+            return;
+        }
+        // Free memory on device
+        if (cudaFree(deviceIntArray) != cudaSuccess) {
+            std::cout << "Error in cudaFree!" << std::endl;
+            delete[] hostIntArray;
+            return;
+        }
+    }
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+    double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
+    std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
+    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second allocation speed." << std::endl;
+
+    if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
+        std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl;
+        delete[] hostIntArray;
+        return;
+    }
+
+    // Prepare data
+    for (int i = 0; i < entryCount; ++i) {
+        hostIntArray[i] = i * 333 + 123;
+    }
+
+    // Copy data TO device
+    start_time = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < N; ++i) {
+        if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl;
+            // Free memory on device
+            if (cudaFree(deviceIntArray) != cudaSuccess) {
+                std::cout << "Error in cudaFree!" << std::endl;
+            }
+            delete[] hostIntArray;
+            return;
+        }
+    }
+    end_time = std::chrono::high_resolution_clock::now();
+    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
+    std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
+    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl;
+
+    // Copy data FROM device
+    start_time = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < N; ++i) {
+        if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
+            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl;
+            // Free memory on device
+            if (cudaFree(deviceIntArray) != cudaSuccess) {
+                std::cout << "Error in cudaFree!" << std::endl;
+            }
+            delete[] hostIntArray;
+            return;
+        }
+    }
+    end_time = std::chrono::high_resolution_clock::now();
+    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
+    std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
+    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl;
+
+    // Free memory on device
+    if (cudaFree(deviceIntArray) != cudaSuccess) {
+        std::cout << "Error in cudaFree!"
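/* [Editorial sketch, not part of this patch. A minimal host-side driver for the FMA wrappers
   declared in basicAdd.h (defined below), with hypothetical test data:

       int A[4] = {1, 2, 3, 4};
       int B[4] = {5, 6, 7, 8};
       int C[4] = {9, 10, 11, 12};
       int D[4] = {0, 0, 0, 0};
       cuda_arrayFma(A, B, C, D, 4);         // GPU: D[i] = A[i] * B[i] + C[i]
       cuda_arrayFmaHelper(A, B, C, D, 4);   // CPU reference for comparison
*/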
 << std::endl;
+    }
+    delete[] hostIntArray;
+}
+
+extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
+    // Size of the Arrays
+    size_t arraySize = N * sizeof(int);
+
+    int* deviceIntArrayA;
+    int* deviceIntArrayB;
+    int* deviceIntArrayC;
+    int* deviceIntArrayD;
+
+    // Allocate space on the device
+    if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
+        printf("Error in cudaMalloc1!\n");
+        return;
+    }
+    if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) {
+        printf("Error in cudaMalloc2!\n");
+        cudaFree(deviceIntArrayA);
+        return;
+    }
+    if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) {
+        printf("Error in cudaMalloc3!\n");
+        cudaFree(deviceIntArrayA);
+        cudaFree(deviceIntArrayB);
+        return;
+    }
+    if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) {
+        printf("Error in cudaMalloc4!\n");
+        cudaFree(deviceIntArrayA);
+        cudaFree(deviceIntArrayB);
+        cudaFree(deviceIntArrayC);
+        return;
+    }
+
+    // Copy data TO device
+    if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+        printf("Error in cudaMemcpy!\n");
+        cudaFree(deviceIntArrayA);
+        cudaFree(deviceIntArrayB);
+        cudaFree(deviceIntArrayC);
+        cudaFree(deviceIntArrayD);
+        return;
+    }
+    if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+        printf("Error in cudaMemcpy!\n");
+        cudaFree(deviceIntArrayA);
+        cudaFree(deviceIntArrayB);
+        cudaFree(deviceIntArrayC);
+        cudaFree(deviceIntArrayD);
+        return;
+    }
+    if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+        printf("Error in cudaMemcpy!\n");
+        cudaFree(deviceIntArrayA);
+        cudaFree(deviceIntArrayB);
+        cudaFree(deviceIntArrayC);
+        cudaFree(deviceIntArrayD);
+        return;
+    }
+
+    // Set the number of threads per block
+    int threadsPerBlock = 512;
+    // Use enough blocks so that every element of the vectors is covered
+    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+    // Run kernel
+    cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N);
+
+    // Copy data FROM device
+    if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
+        printf("Error in cudaMemcpy!\n");
+        cudaFree(deviceIntArrayA);
+        cudaFree(deviceIntArrayB);
+        cudaFree(deviceIntArrayC);
+        cudaFree(deviceIntArrayD);
+        return;
+    }
+
+    // Free memory on device
+    cudaFree(deviceIntArrayA);
+    cudaFree(deviceIntArrayB);
+    cudaFree(deviceIntArrayC);
+    cudaFree(deviceIntArrayD);
+}
+
+extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) {
+    // Size of the Arrays
+    size_t arraySize = N * sizeof(int) * 4;
+
+    int* deviceIntArrayA;
+
+    // Allocate space on the device
+    if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
+        printf("Error in cudaMalloc1!\n");
+        return;
+    }
+
+#define ONFAILFREE0() do { } while(0)
+#define ONFAILFREE1(a) do { cudaFree(a); } while(0)
+#define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0)
+#define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0)
+#define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0)
+#define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { cudaError_t retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in cuda%s!\n", #func__); return; } } while(0)
+
+    // Copy data TO device
+
+    CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice);
+
+    /*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
+        printf("Error in cudaMemcpy!\n");
+        cudaFree(deviceIntArrayA);
+        return;
+    }*/
+
+    // Set the number of threads per block
+    int threadsPerBlock = 512;
+    // Use enough blocks so that every element of the vectors is covered
+    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+    // Run kernel
+    cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M);
+
+    // Copy data FROM device
+    if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
+        printf("Error in cudaMemcpy!\n");
+        cudaFree(deviceIntArrayA);
+        return;
+    }
+
+    // Free memory on device
+    if (cudaFree(deviceIntArrayA) != cudaSuccess) {
+        printf("Error in cudaFree!\n");
+        return;
+    }
+}
+
+extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
+    for (int i = 0; i < N; ++i) {
+        D[i] = A[i] * B[i] + C[i];
+    }
+}
+
+extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) {
+    for (int i = 0; i < N; i += 4) {
+        A[i+3] = A[i] * A[i+1] + A[i+2];
+    }
+}
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/basicAdd.h b/resources/cudaForStorm/srcCuda/basicAdd.h
new file mode 100644
index 000000000..b167244e8
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/basicAdd.h
@@ -0,0 +1,9 @@
+extern "C" int cuda_basicAdd(int a, int b);
+
+extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M);
+extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N);
+
+extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N);
+extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N);
+
+void cpp_cuda_bandwidthTest(int entryCount, int N);
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
new file mode 100644
index 000000000..6aa4a2fb4
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
@@ -0,0 +1,879 @@
+#include "basicValueIteration.h"
+#define CUSP_USE_TEXTURE_MEMORY
+
+#include <iostream>
+#include <chrono>
+
+#include <cuda_runtime.h>
+#include "cusparse_v2.h"
+
+#include "utility.h"
+
+#include "cuspExtension.h"
+
+#include <thrust/transform.h>
+#include <thrust/device_ptr.h>
+#include <thrust/functional.h>
+
+#include "storm-cudaplugin-config.h"
+
+#ifdef DEBUG
+#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false)
+#else
+#define CUDA_CHECK_ALL_ERRORS() do {} while (false)
+#endif
+
+template<typename T, bool Relative>
+struct equalModuloPrecision : public thrust::binary_function<T, T, T>
+{
+__host__ __device__ T operator()(const T &x, const T &y) const
+{
+    if (Relative) {
+        if (y == 0) {
+            return ((x >= 0) ? (x) : (-x));
+        }
+        const T result = (x - y) / y;
+        return ((result >= 0) ? (result) : (-result));
+    } else {
+        const T result = (x - y);
+        return ((result >= 0) ? (result) : (-result));
+    }
+}
+};
+
+template<typename IndexType, typename ValueType>
+void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) {
+    indexVector.reserve(inputVector.size());
+    valueVector.reserve(inputVector.size());
+    for (size_t i = 0; i < inputVector.size(); ++i) {
+        indexVector.push_back(inputVector.at(i).first);
+        valueVector.push_back(inputVector.at(i).second);
+    }
+}
+
+// TEMPLATE VERSION
+template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
+bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+    //std::vector<IndexType> matrixColumnIndices;
+    //std::vector<ValueType> matrixValues;
+    //exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues);
+    bool errorOccured = false;
+
+    IndexType* device_matrixRowIndices = nullptr;
+    ValueType* device_matrixColIndicesAndValues = nullptr;
+    ValueType* device_x = nullptr;
+    ValueType* device_xSwap = nullptr;
+    ValueType* device_b = nullptr;
+    ValueType* device_multiplyResult = nullptr;
+    IndexType* device_nondeterministicChoiceIndices = nullptr;
+
+#ifdef DEBUG
+    std::cout.sync_with_stdio(true);
+    std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
+    std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << " Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
+    size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
+    std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
+#endif
+
+    const IndexType matrixRowCount = matrixRowIndices.size() - 1;
+    const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1;
+    const IndexType matrixNnzCount = columnIndicesAndValues.size();
+
+    cudaError_t cudaMallocResult;
+
+    bool converged = false;
+    iterationCount = 0;
+
+    CUDA_CHECK_ALL_ERRORS();
+    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
+    if (cudaMallocResult != cudaSuccess) {
+        std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
+        errorOccured = true;
+        goto cleanup;
+    }
+
+#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
+#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true
+#else
+#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false
+#endif
+    if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
+        CUDA_CHECK_ALL_ERRORS();
+        cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
+        if (cudaMallocResult != cudaSuccess) {
+            std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "."
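/* [Editorial note with hypothetical data, not part of this patch. The matrix arrives in CSR form:
   matrixRowIndices holds the row offsets, and columnIndicesAndValues stores interleaved
   (column, value) pairs, which is exactly the layout the two alignment checks validate.
   For the 2x2 matrix {{0.5, 0.5}, {0.0, 1.0}}:

       matrixRowIndices       = {0, 2, 3};           // row i occupies entries [offsets[i], offsets[i+1])
       columnIndicesAndValues = {{0, 0.5}, {1, 0.5}, // row 0
                                 {1, 1.0}};          // row 1
*/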
<< std::endl; + errorOccured = true; + goto cleanup; + } + } else { + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; + goto cleanup; + } + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_x), sizeof(ValueType) * matrixColCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_xSwap), sizeof(ValueType) * matrixColCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_b), sizeof(ValueType) * matrixRowCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1)); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; + goto cleanup; + } + +#ifdef DEBUG + std::cout << "(DLL) Finished allocating memory." 
<< std::endl; +#endif + + // Memory allocated, copy data to device + cudaError_t cudaCopyResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } + + // Copy all data as floats are expanded to 64bits :/ + if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) { + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } + } else { + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } + + // Preset the xSwap to zeros... + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } + + // Preset the multiplyResult to zeros... + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } + +#ifdef DEBUG + std::cout << "(DLL) Finished copying data to GPU memory." 
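/* [Editorial note, not part of this patch; assumes a typical 64-bit ABI. With 8-byte indices and
   4-byte float values, the compiler may pad each (index, value) pair to 16 bytes:

       struct Pair { uint_fast64_t first; float second; };   // sizeof(Pair) == 16 with 4 bytes of padding

   CMakeFloatAlignmentCheck.cpp detects exactly this case, and it is why the aligned branch above
   copies 2 * sizeof(IndexType) * matrixNnzCount bytes instead of
   (sizeof(IndexType) + sizeof(ValueType)) * matrixNnzCount. ]
*/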
 << std::endl;
+#endif
+
+    // Data is on device, start Kernel
+    while (!converged && iterationCount < maxIterationCount) { // Scoped block, since a goto to the cleanup label must not bypass variable initializations
+        cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
+        CUDA_CHECK_ALL_ERRORS();
+
+        thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
+        thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult);
+
+        // Transform: Add multiplyResult + b inplace to multiplyResult
+        thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>());
+        CUDA_CHECK_ALL_ERRORS();
+
+        // Reduce: Reduce multiplyResult to a new x vector
+        cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, IndexType, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
+        CUDA_CHECK_ALL_ERRORS();
+
+        // Check for convergence
+        // Transform: x = abs(x - xSwap) / xSwap
+        thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
+        thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount);
+        thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap);
+        thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
+        CUDA_CHECK_ALL_ERRORS();
+
+        // Reduce: get Max over x and check for res < Precision
+        ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
+        CUDA_CHECK_ALL_ERRORS();
+        converged = (maxX < precision);
+        ++iterationCount;
+
+        // Swap pointers, device_x always contains the most current result
+        std::swap(device_x, device_xSwap);
+    }
+
+    if (!converged && (iterationCount == maxIterationCount)) {
+        iterationCount = 0;
+        errorOccured = true;
+    }
+
+#ifdef DEBUG
+    std::cout << "(DLL) Finished kernel execution." << std::endl;
+    std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
+#endif
+
+    // Get x back from the device
+    cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost);
+    if (cudaCopyResult != cudaSuccess) {
+        std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl;
+        errorOccured = true;
+        goto cleanup;
+    }
+
+#ifdef DEBUG
+    std::cout << "(DLL) Finished copying result data." << std::endl;
+#endif
+
+    // All code related to freeing memory and clearing up the device
+cleanup:
+    if (device_matrixRowIndices != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices);
+        if (cudaFreeResult != cudaSuccess) {
+            std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl;
+            errorOccured = true;
+        }
+        device_matrixRowIndices = nullptr;
+    }
+    if (device_matrixColIndicesAndValues != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
+        if (cudaFreeResult != cudaSuccess) {
+            std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
+            errorOccured = true;
+        }
+        device_matrixColIndicesAndValues = nullptr;
+    }
+    if (device_x != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_x);
+        if (cudaFreeResult != cudaSuccess) {
+            std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "."
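/* [Editorial note - illustrative summary, not part of this patch. Each pass of the loop above
   performs one step of value iteration for the MDP:

       multiplyResult = A * x + b                      (SpMV plus thrust::transform)
       xSwap[state]   = min/max of multiplyResult over (storm_cuda_opt_vector_reduce, the
                        the state's choice rows         direction selected by the Minimize flag)
       diff           = |x - xSwap| / |xSwap| (Relative) or |x - xSwap| (absolute)
       converged      = (max(diff) < precision)

   Swapping device_x and device_xSwap afterwards avoids one device-to-device copy per iteration. ]
*/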
 << std::endl;
+            errorOccured = true;
+        }
+        device_x = nullptr;
+    }
+    if (device_xSwap != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_xSwap);
+        if (cudaFreeResult != cudaSuccess) {
+            std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl;
+            errorOccured = true;
+        }
+        device_xSwap = nullptr;
+    }
+    if (device_b != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_b);
+        if (cudaFreeResult != cudaSuccess) {
+            std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl;
+            errorOccured = true;
+        }
+        device_b = nullptr;
+    }
+    if (device_multiplyResult != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_multiplyResult);
+        if (cudaFreeResult != cudaSuccess) {
+            std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl;
+            errorOccured = true;
+        }
+        device_multiplyResult = nullptr;
+    }
+    if (device_nondeterministicChoiceIndices != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices);
+        if (cudaFreeResult != cudaSuccess) {
+            std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." << std::endl;
+            errorOccured = true;
+        }
+        device_nondeterministicChoiceIndices = nullptr;
+    }
+#ifdef DEBUG
+    std::cout << "(DLL) Finished cleanup." << std::endl;
+#endif
+
+    return !errorOccured;
+}
+
+template <typename IndexType, typename ValueType>
+void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) {
+    IndexType* device_matrixRowIndices = nullptr;
+    ValueType* device_matrixColIndicesAndValues = nullptr;
+    ValueType* device_x = nullptr;
+    ValueType* device_multiplyResult = nullptr;
+
+#ifdef DEBUG
+    std::cout.sync_with_stdio(true);
+    std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl;
+    std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << " Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
+    size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size();
+    std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
+#endif
+
+    const IndexType matrixRowCount = matrixRowIndices.size() - 1;
+    const IndexType matrixNnzCount = columnIndicesAndValues.size();
+
+    cudaError_t cudaMallocResult;
+
+    CUDA_CHECK_ALL_ERRORS();
+    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
+    if (cudaMallocResult != cudaSuccess) {
+        std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
+        goto cleanup;
+    }
+
+#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
+    CUDA_CHECK_ALL_ERRORS();
+    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
+    if (cudaMallocResult != cudaSuccess) {
+        std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "."
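/* [Editorial note - illustrative sketch, not part of this patch. Expressed sequentially,
   storm_cuda_opt_spmv_csr_vector computes the standard CSR matrix-vector product that the
   device kernel parallelizes, with the result vector preset to zero:

       for (IndexType row = 0; row < matrixRowCount; ++row)
           for (IndexType k = matrixRowIndices[row]; k < matrixRowIndices[row + 1]; ++k)
               multiplyResult[row] += value(k) * x[columnIndex(k)];

   where value(k) and columnIndex(k) are read from the interleaved columnIndicesAndValues array. ]
*/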
<< std::endl; + goto cleanup; + } +#else + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } +#endif + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_x), sizeof(ValueType) * matrixColCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + +#ifdef DEBUG + std::cout << "(DLL) Finished allocating memory." << std::endl; +#endif + + // Memory allocated, copy data to device + cudaError_t cudaCopyResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } +#else + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } +#endif + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + // Preset the multiplyResult to zeros... + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + +#ifdef DEBUG + std::cout << "(DLL) Finished copying data to GPU memory." << std::endl; +#endif + + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + CUDA_CHECK_ALL_ERRORS(); + +#ifdef DEBUG + std::cout << "(DLL) Finished kernel execution." 
<< std::endl; +#endif + + // Get result back from the device + cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + +#ifdef DEBUG + std::cout << "(DLL) Finished copying result data." << std::endl; +#endif + + // All code related to freeing memory and clearing up the device +cleanup: + if (device_matrixRowIndices != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl; + } + device_matrixRowIndices = nullptr; + } + if (device_matrixColIndicesAndValues != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; + } + device_matrixColIndicesAndValues = nullptr; + } + if (device_x != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_x); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; + } + device_x = nullptr; + } + if (device_multiplyResult != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_multiplyResult); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl; + } + device_multiplyResult = nullptr; + } +#ifdef DEBUG + std::cout << "(DLL) Finished cleanup." << std::endl; +#endif +} + +template +void basicValueIteration_addVectorsInplace(std::vector& a, std::vector const& b) { + ValueType* device_a = nullptr; + ValueType* device_b = nullptr; + + const size_t vectorSize = std::max(a.size(), b.size()); + + cudaError_t cudaMallocResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_a), sizeof(ValueType) * vectorSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_b), sizeof(ValueType) * vectorSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." 
+
+template <typename IndexType, typename ValueType, bool Minimize>
+void basicValueIteration_reduceGroupedVector(std::vector<ValueType> const& groupedVector, std::vector<IndexType> const& grouping, std::vector<ValueType>& targetVector) {
+	ValueType* device_groupedVector = nullptr;
+	IndexType* device_grouping = nullptr;
+	ValueType* device_target = nullptr;
+
+	const size_t groupedSize = groupedVector.size();
+	const size_t groupingSize = grouping.size();
+	const size_t targetSize = targetVector.size();
+
+	cudaError_t cudaMallocResult;
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_groupedVector), sizeof(ValueType) * groupedSize);
+	if (cudaMallocResult != cudaSuccess) {
+		std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl;
+		goto cleanup;
+	}
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_grouping), sizeof(IndexType) * groupingSize);
+	if (cudaMallocResult != cudaSuccess) {
+		std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl;
+		goto cleanup;
+	}
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_target), sizeof(ValueType) * targetSize);
+	if (cudaMallocResult != cudaSuccess) {
+		std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl;
+		goto cleanup;
+	}
+
+	// Memory allocated, copy data to device
+	cudaError_t cudaCopyResult;
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice);
+	if (cudaCopyResult != cudaSuccess) {
+		std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl;
+		goto cleanup;
+	}
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice);
+	if (cudaCopyResult != cudaSuccess) {
+		std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl;
+		goto cleanup;
+	}
+
+	do {
+		// Reduce: Reduce multiplyResult to a new x vector
+		cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector);
+		CUDA_CHECK_ALL_ERRORS();
+	} while (false);
+
+	// Get result back from the device
+	cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost);
+	if (cudaCopyResult != cudaSuccess) {
+		std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
+		goto cleanup;
+	}
+
+	// All code related to freeing memory and clearing up the device
+cleanup:
+	if (device_groupedVector != nullptr) {
+		cudaError_t cudaFreeResult = cudaFree(device_groupedVector);
+		if (cudaFreeResult != cudaSuccess) {
+			std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl;
+		}
+		device_groupedVector = nullptr;
+	}
+	if (device_grouping != nullptr) {
+		cudaError_t cudaFreeResult = cudaFree(device_grouping);
+		if (cudaFreeResult != cudaSuccess) {
+			std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." << std::endl;
+		}
+		device_grouping = nullptr;
+	}
+	if (device_target != nullptr) {
+		cudaError_t cudaFreeResult = cudaFree(device_target);
+		if (cudaFreeResult != cudaSuccess) {
+			std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl;
+		}
+		device_target = nullptr;
+	}
+}
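The grouping vector holds row-group offsets (the same layout as the nondeterministicChoiceIndices used elsewhere in this patch): group g covers the half-open index range [grouping[g], grouping[g+1]). A host-side reference of the reduction, with minimize as a runtime stand-in for the Minimize template parameter (illustration only):

    #include <algorithm>
    #include <limits>
    bool const minimize = true; // stand-in for the Minimize template parameter
    for (size_t g = 0; g + 1 < grouping.size(); ++g) {
        double best = minimize ? std::numeric_limits<double>::max()
                               : -std::numeric_limits<double>::max();
        for (uint_fast64_t j = grouping[g]; j < grouping[g + 1]; ++j) {
            best = minimize ? std::min(best, groupedVector[j]) : std::max(best, groupedVector[j]);
        }
        targetVector[g] = best;
    }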
+
+template <typename ValueType, bool Relative>
+void basicValueIteration_equalModuloPrecision(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType& maxElement) {
+	ValueType* device_x = nullptr;
+	ValueType* device_y = nullptr;
+
+	const size_t vectorSize = x.size();
+
+	cudaError_t cudaMallocResult;
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * vectorSize);
+	if (cudaMallocResult != cudaSuccess) {
+		std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
+		goto cleanup;
+	}
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_y), sizeof(ValueType) * vectorSize);
+	if (cudaMallocResult != cudaSuccess) {
+		std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl;
+		goto cleanup;
+	}
+
+	// Memory allocated, copy data to device
+	cudaError_t cudaCopyResult;
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
+	if (cudaCopyResult != cudaSuccess) {
+		std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
+		goto cleanup;
+	}
+
+	CUDA_CHECK_ALL_ERRORS();
+	cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
+	if (cudaCopyResult != cudaSuccess) {
+		std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl;
+		goto cleanup;
+	}
+
+	do {
+		// Transform: x = abs(x - xSwap)/ xSwap
+		thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
+		thrust::device_ptr<ValueType> devicePtrThrust_y(device_y);
+		thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
+		CUDA_CHECK_ALL_ERRORS();
+
+		// Reduce: get Max over x and check for res < Precision
+		maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
+		CUDA_CHECK_ALL_ERRORS();
+	} while (false);
+
+	// All code related to freeing memory and clearing up the device
+cleanup:
+	if (device_x != nullptr) {
+		cudaError_t cudaFreeResult = cudaFree(device_x);
+		if (cudaFreeResult != cudaSuccess) {
+			std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
+		}
+		device_x = nullptr;
+	}
+	if (device_y != nullptr) {
+		cudaError_t cudaFreeResult = cudaFree(device_y);
+		if (cudaFreeResult != cudaSuccess) {
+			std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." << std::endl;
+		}
+		device_y = nullptr;
+	}
+}
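The equalModuloPrecision functor itself is not part of this hunk; based on the comment "x = abs(x - xSwap) / xSwap" it plausibly looks like the sketch below (an assumption, not the patch's verbatim definition). Relative selects the relative difference and is guarded against division by zero:

    #include <thrust/functional.h>
    // Sketch of a per-element difference functor for the convergence check above (assumed shape).
    template <typename ValueType, bool Relative>
    struct equalModuloPrecision : public thrust::binary_function<ValueType, ValueType, ValueType> {
        __host__ __device__ ValueType operator()(ValueType const& x, ValueType const& y) const {
            if (Relative) {
                if (y == 0) {
                    return ((x >= 0) ? x : -x); // avoid division by zero
                }
                ValueType const result = (x - y) / y;
                return ((result >= 0) ? result : -result);
            } else {
                ValueType const result = x - y;
                return ((result >= 0) ? result : -result);
            }
        }
    };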
+
+/*
+ * Declare and implement all exported functions for these Kernels here
+ *
+ */
+
+void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) {
+	basicValueIteration_spmv<uint_fast64_t, double>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
+}
+
+void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) {
+	basicValueIteration_addVectorsInplace<double>(a, b);
+}
+
+void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
+	basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(groupedVector, grouping, targetVector);
+}
+
+void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
+	basicValueIteration_reduceGroupedVector<uint_fast64_t, double, false>(groupedVector, grouping, targetVector);
+}
+
+void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
+	basicValueIteration_equalModuloPrecision<double, true>(x, y, maxElement);
+}
+
+void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
+	basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement);
+}
+
+// Float
+void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b) {
+	basicValueIteration_spmv<uint_fast64_t, float>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
+}
+
+void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b) {
+	basicValueIteration_addVectorsInplace<float>(a, b);
+}
+
+void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
+	basicValueIteration_reduceGroupedVector<uint_fast64_t, float, true>(groupedVector, grouping, targetVector);
+}
+
+void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
+	basicValueIteration_reduceGroupedVector<uint_fast64_t, float, false>(groupedVector, grouping, targetVector);
+}
+
+void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
+	basicValueIteration_equalModuloPrecision<float, true>(x, y, maxElement);
+}
+
+void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
+	basicValueIteration_equalModuloPrecision<float, false>(x, y, maxElement);
+}
+
+bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+	if (relativePrecisionCheck) {
+		return basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+	} else {
+		return basicValueIteration_mvReduce<true, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+	}
+}
+
+bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+	if (relativePrecisionCheck) {
+		return basicValueIteration_mvReduce<false, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+	} else {
+		return basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+	}
+}
+
+bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+	if (relativePrecisionCheck) {
+		return basicValueIteration_mvReduce<true, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+	} else {
+		return basicValueIteration_mvReduce<true, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+	}
+}
+
+bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
+	if (relativePrecisionCheck) {
+		return basicValueIteration_mvReduce<false, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+	} else {
+		return basicValueIteration_mvReduce<false, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
+	}
+}
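A minimal host-side usage sketch of the minimizing double variant, for a made-up toy model with three matrix rows split into two row groups (the data is invented for illustration; the signature is the reconstructed one above):

    std::vector<uint_fast64_t> rowGroupIndices = {0, 2, 3};    // 2 groups: rows [0,2) and [2,3)
    std::vector<uint_fast64_t> rowIndices      = {0, 1, 2, 3}; // one non-zero per row
    std::vector<storm::storage::MatrixEntry<double>> entries = {
        storm::storage::MatrixEntry<double>(1, 1.0),
        storm::storage::MatrixEntry<double>(0, 1.0),
        storm::storage::MatrixEntry<double>(1, 1.0)
    };
    std::vector<double> x = {0.0, 0.0};                        // one value per row group
    std::vector<double> b = {0.0, 0.5, 0.0};                   // one offset per row
    size_t performedIterations = 0;

    bool converged = basicValueIteration_mvReduce_uint64_double_minimize(
        10000, 1e-6, true, rowIndices, entries, x, b, rowGroupIndices, performedIterations);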
+
+size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
+	size_t const valueTypeSize = sizeof(double);
+	size_t const indexTypeSize = sizeof(uint_fast64_t);
+
+	/*
+	IndexType* device_matrixRowIndices = nullptr;
+	IndexType* device_matrixColIndices = nullptr;
+	ValueType* device_matrixValues = nullptr;
+	ValueType* device_x = nullptr;
+	ValueType* device_xSwap = nullptr;
+	ValueType* device_b = nullptr;
+	ValueType* device_multiplyResult = nullptr;
+	IndexType* device_nondeterministicChoiceIndices = nullptr;
+	*/
+
+	// Row Indices, Column Indices, Values, Choice Indices
+	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
+	// Vectors x, xSwap, b, multiplyResult
+	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
+
+	return (matrixDataSize + vectorSizes);
+}
+
+size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
+	size_t const valueTypeSize = sizeof(float);
+	size_t const indexTypeSize = sizeof(uint_fast64_t);
+
+	/*
+	IndexType* device_matrixRowIndices = nullptr;
+	IndexType* device_matrixColIndices = nullptr;
+	ValueType* device_matrixValues = nullptr;
+	ValueType* device_x = nullptr;
+	ValueType* device_xSwap = nullptr;
+	ValueType* device_b = nullptr;
+	ValueType* device_multiplyResult = nullptr;
+	IndexType* device_nondeterministicChoiceIndices = nullptr;
+	*/
+
+	// Row Indices, Column Indices, Values, Choice Indices
+	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
+	// Vectors x, xSwap, b, multiplyResult
+	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
+
+	return (matrixDataSize + vectorSizes);
+}
\ No newline at end of file
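Plugging concrete numbers into the formula above makes the bookkeeping easy to check (double values and 64-bit indices, so every element is 8 bytes). For rowCount = 1000, rowGroupCount = 600 and nnzCount = 5000:

    matrixDataSize = (1001 + 5000 + 601) * 8 + 5000 * 8 = 52816 + 40000 = 92816 bytes
    vectorSizes    = (600 + 600 + 1000 + 1000) * 8      = 25600 bytes
    total          = 92816 + 25600                      = 118416 bytes (about 116 KiB)

A caller can presumably compare this figure against the free device memory reported by cudaMemGetInfo before deciding to off-load a model to the GPU.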
diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h
new file mode 100644
index 000000000..09b4be5ca
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h
@@ -0,0 +1,107 @@
+#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
+#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
+
+#include <cstdint>
+#include <vector>
+#include <utility>
+
+// Library exports
+#include "cudaForStorm_Export.h"
+
+/* Helper declaration to cope with new internal format */
+#ifndef STORM_STORAGE_SPARSEMATRIX_H_
+namespace storm {
+	namespace storage {
+		template <typename T>
+		class MatrixEntry {
+		public:
+			/*!
+			 * Constructs a matrix entry with the given column and value.
+			 *
+			 * @param column The column of the matrix entry.
+			 * @param value The value of the matrix entry.
+			 */
+			MatrixEntry(uint_fast64_t column, T value);
+
+			/*!
+			 * Move-constructs the matrix entry from the given column-value pair.
+			 *
+			 * @param pair The column-value pair from which to move-construct the matrix entry.
+			 */
+			MatrixEntry(std::pair<uint_fast64_t, T>&& pair);
+
+			//MatrixEntry() = default;
+			//MatrixEntry(MatrixEntry const& other) = default;
+			//MatrixEntry& operator=(MatrixEntry const& other) = default;
+#ifndef WINDOWS
+			//MatrixEntry(MatrixEntry&& other) = default;
+			//MatrixEntry& operator=(MatrixEntry&& other) = default;
+#endif
+
+			/*!
+			 * Retrieves the column of the matrix entry.
+			 *
+			 * @return The column of the matrix entry.
+			 */
+			uint_fast64_t const& getColumn() const;
+
+			/*!
+			 * Retrieves the column of the matrix entry.
+			 *
+			 * @return The column of the matrix entry.
+			 */
+			uint_fast64_t& getColumn();
+
+			/*!
+			 * Retrieves the value of the matrix entry.
+			 *
+			 * @return The value of the matrix entry.
+			 */
+			T const& getValue() const;
+
+			/*!
+			 * Retrieves the value of the matrix entry.
+			 *
+			 * @return The value of the matrix entry.
+			 */
+			T& getValue();
+
+			/*!
+			 * Retrieves a pair of column and value that characterizes this entry.
+			 *
+			 * @return A column-value pair that characterizes this entry.
+			 */
+			std::pair<uint_fast64_t, T> const& getColumnValuePair() const;
+
+		private:
+			// The actual matrix entry.
+			std::pair<uint_fast64_t, T> entry;
+		};
+
+	}
+}
+#endif
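The device code treats the entry array as raw interleaved (index, value) words; the double kernel in cuspExtensionDouble.h below reads matrixColumnIndicesAndValues[2 * jj + 1], so the whole scheme hinges on MatrixEntry<double> having exactly the layout of std::pair<uint_fast64_t, double> with no extra padding. A compile-time check of that assumption (my addition, not part of the patch) could read:

    // Sanity check for the layout assumption on a typical LP64 platform.
    static_assert(sizeof(storm::storage::MatrixEntry<double>) == 2 * sizeof(uint_fast64_t),
                  "MatrixEntry<double> must pack into two 64-bit words for the CUDA kernels");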
+
+cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
+cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
+cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
+
+cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
+cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
+cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
+
+cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b);
+cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b);
+cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector);
+cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector);
+cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement);
+cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement);
+
+cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b);
+cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b);
+cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector);
+cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector);
+cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement);
+cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement);
+
+#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/cudaForStorm.h b/resources/cudaForStorm/srcCuda/cudaForStorm.h
new file mode 100644
index 000000000..2ea39c2d0
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/cudaForStorm.h
@@ -0,0 +1,19 @@
+#ifndef STORM_CUDAFORSTORM_CUDAFORSTORM_H_
+#define STORM_CUDAFORSTORM_CUDAFORSTORM_H_
+
+/*
+ * List of exported functions in this library
+ */
+
+// TopologicalValueIteration
+#include "srcCuda/basicValueIteration.h"
+
+// Utility Functions
+#include "srcCuda/utility.h"
+
+// Version Information
+#include "srcCuda/version.h"
+
+
+
+#endif // STORM_CUDAFORSTORM_CUDAFORSTORM_H_
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h
new file mode 100644
index 000000000..11c673bf9
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/cuspExtension.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "cuspExtensionFloat.h"
+#include "cuspExtensionDouble.h"
+
+namespace cusp {
+namespace detail {
+namespace device {
+
+template <typename ValueType>
+void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) {
+	// The generic version must never be called; only the specializations below are valid.
+	throw;
+}
+template <>
+void storm_cuda_opt_spmv_csr_vector<double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) {
+	storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
+}
+template <>
+void storm_cuda_opt_spmv_csr_vector<float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) {
+	storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
+}
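The unspecialized template deliberately terminates (a bare throw with no active exception calls std::terminate), so only the double and float specializations are usable and the element type must be picked explicitly at the call site, e.g.:

    // Dispatches to storm_cuda_opt_spmv_csr_vector_double(...) via the specialization above.
    cusp::detail::device::storm_cuda_opt_spmv_csr_vector<double>(
        matrixRowCount, matrixNnzCount, device_matrixRowIndices,
        device_matrixColIndicesAndValues, device_x, device_multiplyResult);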
+
+template <bool Minimize, typename ValueType>
+void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) {
+	// The generic version must never be called; only the specializations below are valid.
+	throw;
+}
+template <>
+void storm_cuda_opt_vector_reduce<true, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
+	storm_cuda_opt_vector_reduce_double<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
+}
+template <>
+void storm_cuda_opt_vector_reduce<false, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
+	storm_cuda_opt_vector_reduce_double<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
+}
+
+template <>
+void storm_cuda_opt_vector_reduce<true, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
+	storm_cuda_opt_vector_reduce_float<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
+}
+template <>
+void storm_cuda_opt_vector_reduce<false, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
+	storm_cuda_opt_vector_reduce_float<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
+}
+
+} // end namespace device
+} // end namespace detail
+} // end namespace cusp
\ No newline at end of file
diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
new file mode 100644
index 000000000..901df0ae7
--- /dev/null
+++ b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
@@ -0,0 +1,361 @@
+/*
+ * This is an extension of the original CUSP csr_vector.h SPMV implementation.
+ * It is based on the original code and incorporates changes to cope with the details
+ * of the StoRM code.
+ * Changes have been made for 1) the different input format, 2) the sum calculation and 3) the group-reduce algorithm.
+ */
+
+/*
+ * Copyright 2008-2009 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#pragma once
+
+#include <limits>
+#include <cstdint>
+#include <algorithm>
+
+#include <math_functions.h>
+
+#include <cusp/detail/device/spmv/csr_vector.h>
+
+namespace cusp
+{
+namespace detail
+{
+namespace device
+{
+
+//////////////////////////////////////////////////////////////////////////////
+// CSR SpMV kernels based on a vector model (one warp per row)
+//////////////////////////////////////////////////////////////////////////////
+//
+// spmv_csr_vector_device
+//   Each row of the CSR matrix is assigned to a warp.  The warp computes
+//   y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with
+//   the x vector, in parallel.  This division of work implies that
+//   the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
+//   manner (but generally not aligned).  On GT200 these accesses are
+//   coalesced, unlike kernels based on the one-row-per-thread division of
+// work.
Since an entire 32-thread warp is assigned to each row, many +// threads will remain idle when their row contains a small number +// of elements. This code relies on implicit synchronization among +// threads in a warp. +// +// spmv_csr_vector_tex_device +// Same as spmv_csr_vector_tex_device, except that the texture cache is +// used for accessing the x vector. +// +// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] + + +template +__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) +__global__ void +storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y) +{ + __shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block + const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local sum + double sum = 0; + + if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + } + + // accumulate local sums + for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); + } + } else { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = sum; + + // reduce local sums to row sum + if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; + if (THREADS_PER_VECTOR > 8) 
sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; + if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; + if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; + if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; + + // first thread writes the result + if (thread_lane == 0) + y[row] = sdata[threadIdx.x]; + } +} + +template +__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) +__global__ void +storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer) +{ + __shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block + const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local Min/Max + double localMinMaxElement = minMaxInitializer; + + if (THREADS_PER_ROW == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // accumulate local sums + for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + else + { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? 
y[jj] : localMinMaxElement; + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = localMinMaxElement; + + // reduce local min/max to row min/max + if (Minimize) { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ + + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); + } else { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? 
sdata[threadIdx.x + 1] : localMinMaxElement);*/ + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); + } + + // first thread writes the result + if (thread_lane == 0) + x[row] = sdata[threadIdx.x]; + } +} + +template +void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) +{ + double __minMaxInitializer = -std::numeric_limits::max(); + if (Minimize) { + __minMaxInitializer = std::numeric_limits::max(); + } + const double minMaxInitializer = __minMaxInitializer; + + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + storm_cuda_opt_vector_reduce_kernel_double <<>> + (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); +} + +template +void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) +{ + const uint_fast64_t rows_per_group = num_entries / num_rows; + + if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + + __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); +} + +template +void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + storm_cuda_opt_spmv_csr_vector_kernel_double <<>> + (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { 
__storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +// NON-OPT + +template +void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + spmv_csr_vector_kernel <<>> + (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const 
double * matrixValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +} // end namespace device +} // end namespace detail +} // end namespace cusp \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h b/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h new file mode 100644 index 000000000..bb9acf78e --- /dev/null +++ b/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h @@ -0,0 +1,375 @@ +/* + * This is an extension of the original CUSP csr_vector.h SPMV implementation. + * It is based on the Code and incorporates changes as to cope with the details + * of the StoRM code. + * As this is mostly copy & paste, the original license still applies. + */ + +/* + * Copyright 2008-2009 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include +#include + +#include + +#include + +#include "storm-cudaplugin-config.h" + +namespace cusp +{ +namespace detail +{ +namespace device +{ + +////////////////////////////////////////////////////////////////////////////// +// CSR SpMV kernels based on a vector model (one warp per row) +////////////////////////////////////////////////////////////////////////////// +// +// spmv_csr_vector_device +// Each row of the CSR matrix is assigned to a warp. The warp computes +// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with +// the x vector, in parallel. This division of work implies that +// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous +// manner (but generally not aligned). On GT200 these accesses are +// coalesced, unlike kernels based on the one-row-per-thread division of +// work. Since an entire 32-thread warp is assigned to each row, many +// threads will remain idle when their row contains a small number +// of elements. This code relies on implicit synchronization among +// threads in a warp. +// +// spmv_csr_vector_tex_device +// Same as spmv_csr_vector_tex_device, except that the texture cache is +// used for accessing the x vector. 
+// +// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] + + +template +__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) +__global__ void +storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y) +{ + __shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block + const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local sum + float sum = 0; + + if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); +#else + sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); +#endif + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + } + + // accumulate local sums + for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); +#else + sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); +#endif + } + } else { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); +#else + sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); +#endif + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = sum; + + // reduce local sums 
to row sum + if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; + if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; + if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; + if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; + if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; + + // first thread writes the result + if (thread_lane == 0) + y[row] = sdata[threadIdx.x]; + } +} + +template +__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) +__global__ void +storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer) +{ + __shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block + const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local Min/Max + float localMinMaxElement = minMaxInitializer; + + if (THREADS_PER_ROW == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // accumulate local sums + for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + else + { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? 
y[jj] : localMinMaxElement; + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = localMinMaxElement; + + // reduce local min/max to row min/max + if (Minimize) { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ + + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); + } else { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? 
sdata[threadIdx.x + 1] : localMinMaxElement);*/ + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); + } + + // first thread writes the result + if (thread_lane == 0) + x[row] = sdata[threadIdx.x]; + } +} + +template +void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) +{ + float __minMaxInitializer = -std::numeric_limits::max(); + if (Minimize) { + __minMaxInitializer = std::numeric_limits::max(); + } + const float minMaxInitializer = __minMaxInitializer; + + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + storm_cuda_opt_vector_reduce_kernel_float <<>> + (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); +} + +template +void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) +{ + const uint_fast64_t rows_per_group = num_entries / num_rows; + + if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + + __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); +} + +template +void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + storm_cuda_opt_spmv_csr_vector_kernel_float <<>> + (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, 
matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +// NON-OPT + +template +void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + spmv_csr_vector_kernel <<>> + (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) +{ + const uint_fast64_t 
nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +} // end namespace device +} // end namespace detail +} // end namespace cusp \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu b/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu new file mode 100644 index 000000000..2be10e8ca --- /dev/null +++ b/resources/cudaForStorm/srcCuda/kernelSwitchTest.cu @@ -0,0 +1,39 @@ +#include +#include + +__global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) { + *B = *A; +} + +void kernelSwitchTest(size_t N) { + int* deviceIntA; + int* deviceIntB; + + if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) { + std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; + return; + } + if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) { + std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; + return; + } + + // Allocate space on the device + auto start_time = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < N; ++i) { + cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB); + } + auto end_time = std::chrono::high_resolution_clock::now(); + std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast(end_time - start_time).count() << "micros" << std::endl; + std::cout << "Resulting in " << (std::chrono::duration_cast(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl; + + // Free memory on device + if (cudaFree(deviceIntA) != cudaSuccess) { + std::cout << "Error in cudaFree!" << std::endl; + return; + } + if (cudaFree(deviceIntB) != cudaSuccess) { + std::cout << "Error in cudaFree!" 
<< std::endl; + return; + } +} \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/kernelSwitchTest.h b/resources/cudaForStorm/srcCuda/kernelSwitchTest.h new file mode 100644 index 000000000..dff8a13ff --- /dev/null +++ b/resources/cudaForStorm/srcCuda/kernelSwitchTest.h @@ -0,0 +1 @@ +void kernelSwitchTest(size_t N); \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/utility.cu b/resources/cudaForStorm/srcCuda/utility.cu new file mode 100644 index 000000000..99165ba07 --- /dev/null +++ b/resources/cudaForStorm/srcCuda/utility.cu @@ -0,0 +1,33 @@ +#include "utility.h" + +#include + +size_t getFreeCudaMemory() { + size_t freeMemory; + size_t totalMemory; + cudaMemGetInfo(&freeMemory, &totalMemory); + + return freeMemory; +} + +size_t getTotalCudaMemory() { + size_t freeMemory; + size_t totalMemory; + cudaMemGetInfo(&freeMemory, &totalMemory); + + return totalMemory; +} + +bool resetCudaDevice() { + cudaError_t result = cudaDeviceReset(); + return (result == cudaSuccess); +} + +int getRuntimeCudaVersion() { + int result = -1; + cudaError_t errorResult = cudaRuntimeGetVersion(&result); + if (errorResult != cudaSuccess) { + return -1; + } + return result; +} \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/utility.h b/resources/cudaForStorm/srcCuda/utility.h new file mode 100644 index 000000000..f3110fbeb --- /dev/null +++ b/resources/cudaForStorm/srcCuda/utility.h @@ -0,0 +1,12 @@ +#ifndef STORM_CUDAFORSTORM_UTILITY_H_ +#define STORM_CUDAFORSTORM_UTILITY_H_ + +// Library exports +#include "cudaForStorm_Export.h" + +cudaForStorm_EXPORT size_t getFreeCudaMemory(); +cudaForStorm_EXPORT size_t getTotalCudaMemory(); +cudaForStorm_EXPORT bool resetCudaDevice(); +cudaForStorm_EXPORT int getRuntimeCudaVersion(); + +#endif // STORM_CUDAFORSTORM_UTILITY_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/version.cu b/resources/cudaForStorm/srcCuda/version.cu new file mode 100644 index 000000000..3850c895c --- /dev/null +++ b/resources/cudaForStorm/srcCuda/version.cu @@ -0,0 +1,28 @@ +#include "version.h" + +#include "storm-cudaplugin-config.h" + +size_t getStormCudaPluginVersionMajor() { + return STORM_CUDAPLUGIN_VERSION_MAJOR; +} + +size_t getStormCudaPluginVersionMinor() { + return STORM_CUDAPLUGIN_VERSION_MINOR; +} + +size_t getStormCudaPluginVersionPatch() { + return STORM_CUDAPLUGIN_VERSION_PATCH; +} + +size_t getStormCudaPluginVersionCommitsAhead() { + return STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD; +} + +const char* getStormCudaPluginVersionHash() { + static const std::string versionHash = STORM_CUDAPLUGIN_VERSION_HASH; + return versionHash.c_str(); +} + +bool getStormCudaPluginVersionIsDirty() { + return ((STORM_CUDAPLUGIN_VERSION_DIRTY) != 0); +} \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/version.h b/resources/cudaForStorm/srcCuda/version.h new file mode 100644 index 000000000..de3f4f16c --- /dev/null +++ b/resources/cudaForStorm/srcCuda/version.h @@ -0,0 +1,16 @@ +#ifndef STORM_CUDAFORSTORM_VERSION_H_ +#define STORM_CUDAFORSTORM_VERSION_H_ + +// Library exports +#include "cudaForStorm_Export.h" + +#include + +cudaForStorm_EXPORT size_t getStormCudaPluginVersionMajor(); +cudaForStorm_EXPORT size_t getStormCudaPluginVersionMinor(); +cudaForStorm_EXPORT size_t getStormCudaPluginVersionPatch(); +cudaForStorm_EXPORT size_t getStormCudaPluginVersionCommitsAhead(); +cudaForStorm_EXPORT const char* getStormCudaPluginVersionHash(); +cudaForStorm_EXPORT bool 
getStormCudaPluginVersionIsDirty(); + +#endif // STORM_CUDAFORSTORM_VERSION_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/storm-cudaplugin-config.h.in b/resources/cudaForStorm/storm-cudaplugin-config.h.in new file mode 100644 index 000000000..3703d0c81 --- /dev/null +++ b/resources/cudaForStorm/storm-cudaplugin-config.h.in @@ -0,0 +1,21 @@ +/* + * StoRM - Built-in Options + * + * This file is parsed by CMake during makefile generation + */ + +#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ +#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ + +// Version Information +#define STORM_CUDAPLUGIN_VERSION_MAJOR @STORM_CUDAPLUGIN_VERSION_MAJOR@ // The major version of StoRM +#define STORM_CUDAPLUGIN_VERSION_MINOR @STORM_CUDAPLUGIN_VERSION_MINOR@ // The minor version of StoRM +#define STORM_CUDAPLUGIN_VERSION_PATCH @STORM_CUDAPLUGIN_VERSION_PATCH@ // The patch version of StoRM +#define STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD @STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD@ // How many commits have passed since the tag was last set +#define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" // The short hash of the git commit this build is based on +#define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ // 0 iff no files were modified in the checkout, 1 otherwise + +// Whether the size of float in a pair is expanded to 64bit +#@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + +#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ diff --git a/src/counterexamples/PathBasedSubsystemGenerator.h b/src/counterexamples/PathBasedSubsystemGenerator.h index d4204a18c..5c5ff4159 100644 --- a/src/counterexamples/PathBasedSubsystemGenerator.h +++ b/src/counterexamples/PathBasedSubsystemGenerator.h @@ -430,7 +430,7 @@ public: allowedStates = storm::storage::BitVector(targetStates.size(), true); } else if(globally.get() != nullptr){ - //eventually reaching a state without property visiting only states with property + // eventually reaching a state without the property while visiting only states with the property allowedStates = globally->getChild()->check(modelCheck); targetStates = storm::storage::BitVector(allowedStates); targetStates.complement(); @@ -451,9 +451,9 @@ public: // estimate the path count using the models state count as well as the probability bound uint_fast8_t const minPrec = 10; uint_fast64_t const stateCount = model.getNumberOfStates(); - uint_fast64_t const stateEstimate = static_cast(stateCount * bound); + uint_fast64_t const stateEstimate = static_cast((static_cast(stateCount)) * bound); - //since this only has a good effect on big models -> use only if model has at least 10^5 states + // since this only has a good effect on big models -> use only if the model has at least 10^5 states uint_fast64_t precision = stateEstimate > 100000 ? stateEstimate/1000 : minPrec; @@ -546,11 +546,11 @@ public: //std::cout << "Diff: " << diff << std::endl; //std::cout << "Path count: " << pathCount << std::endl; - //Are we critical? + // Are we critical?
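The hunk above also corrects the adaptive precision update used a few lines below. For clarity, a minimal standalone sketch of that interpolation (the function name and signature are illustrative, not part of the patch): precision starts at stateEstimate/1000 far from the bound and falls linearly towards minPrec as the accumulated subsystem probability approaches the bound.

#include <cstdint>

// Sketch: interpolate the path-count precision between stateEstimate/1000
// (far from the bound) and minPrec (close to the bound).
uint_fast64_t adaptPrecision(uint_fast64_t stateEstimate, uint_fast64_t minPrec, double subSysProb, double bound) {
    double const coarse = stateEstimate / 1000.0;
    return static_cast<uint_fast64_t>(coarse - (coarse - minPrec) * (subSysProb / bound));
}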
if(subSysProb >= bound){ break; } else if (stateEstimate > 100000){ - precision = static_cast((stateEstimate / 1000.0) - ((stateEstimate / 1000.0) - minPrec)*(subSysProb/bound)); + precision = static_cast((stateEstimate / 1000.0) - ((stateEstimate / 1000.0) - minPrec) * (subSysProb/bound)); } } } diff --git a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h index 2a584a4d0..7b9c58d52 100644 --- a/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h +++ b/src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h @@ -9,6 +9,7 @@ #define STORM_MODELCHECKER_PRCTL_TOPOLOGICALVALUEITERATIONSMDPPRCTLMODELCHECKER_H_ #include "src/modelchecker/prctl/SparseMdpPrctlModelChecker.h" +#include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h" #include "src/exceptions/InvalidPropertyException.h" #include @@ -29,7 +30,7 @@ public: * * @param model The MDP to be checked. */ - explicit TopologicalValueIterationMdpPrctlModelChecker(storm::models::Mdp const& model) : SparseMdpPrctlModelChecker(model) { + explicit TopologicalValueIterationMdpPrctlModelChecker(storm::models::Mdp const& model) : SparseMdpPrctlModelChecker(model, std::shared_ptr>(new storm::solver::TopologicalValueIterationNondeterministicLinearEquationSolver())) { // Intentionally left empty. } @@ -37,8 +38,8 @@ public: * Copy constructs a SparseMdpPrctlModelChecker from the given model checker. In particular, this means that the newly * constructed model checker will have the model of the given model checker as its associated model. */ - explicit TopologicalValueIterationMdpPrctlModelChecker(storm::modelchecker::TopologicalValueIterationMdpPrctlModelChecker const& modelchecker) - : SparseMdpPrctlModelChecker(modelchecker), minimumOperatorStack() { + explicit TopologicalValueIterationMdpPrctlModelChecker(storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker const& modelchecker) + : SparseMdpPrctlModelChecker(modelchecker) { // Intentionally left empty. } @@ -46,100 +47,6 @@ public: * Virtual destructor. Needs to be virtual, because this class has virtual methods. */ virtual ~TopologicalValueIterationMdpPrctlModelChecker() { } - -private: - /*! - * Solves the given equation system under the given parameters using the power method. - * - * @param A The matrix A specifying the coefficients of the equations. - * @param x The vector x for which to solve the equations. The initial value of the elements of - * this vector are used as the initial guess and might thus influence performance and convergence. - * @param b The vector b specifying the values on the right-hand-sides of the equations. - * @return The solution of the system of linear equations in form of the elements of the vector - * x. - */ - void solveEquationSystem(storm::storage::SparseMatrix const& matrix, std::vector& x, std::vector const& b) const { - // Get the settings object to customize solving. - storm::settings::Settings* s = storm::settings::Settings::getInstance(); - - // Get relevant user-defined settings for solving the equations. - double precision = s->getOptionByLongName("precision").getArgument(0).getValueAsDouble(); - uint_fast64_t maxIterations = s->getOptionByLongName("maxIterations").getArgument(0).getValueAsUnsignedInteger(); - bool relative = s->getOptionByLongName("relative").getArgument(0).getValueAsBoolean(); - - // Now, we need to determine the SCCs of the MDP and a topological sort. 
- std::vector> stronglyConnectedComponents = storm::utility::graph::performSccDecomposition(this->getModel(), stronglyConnectedComponents, stronglyConnectedComponentsDependencyGraph); - storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = this->getModel().extractSccDependencyGraph(stronglyConnectedComponents); - std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); - - // Set up the environment for the power method. - std::vector multiplyResult(matrix.getRowCount()); - std::vector* currentX = &x; - std::vector* newX = new std::vector(x.size()); - std::vector* swap = nullptr; - uint_fast64_t currentMaxLocalIterations = 0; - uint_fast64_t localIterations = 0; - uint_fast64_t globalIterations = 0; - bool converged = true; - - // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only - // solved after all SCCs it depends on have been solved. - for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) { - std::vector const& scc = stronglyConnectedComponents[*sccIndexIt]; - - // For the current SCC, we need to perform value iteration until convergence. - localIterations = 0; - converged = false; - while (!converged && localIterations < maxIterations) { - // Compute x' = A*x + b. - matrix.multiplyWithVector(scc, *currentX, multiplyResult); - storm::utility::addVectors(scc, matrix.getRowGroupIndices(), multiplyResult, b); - - // Reduce the vector x' by applying min/max for all non-deterministic choices. - if (this->minimumOperatorStack.top()) { - storm::utility::reduceVectorMin(multiplyResult, newX, scc, matrix.getRowGroupIndices()); - } else { - storm::utility::reduceVectorMax(multiplyResult, newX, scc, matrix.getRowGroupIndices()); - } - - // Determine whether the method converged. - // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher - // running time. In fact, it is faster. This has to be investigated. - // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); - converged = storm::utility::equalModuloPrecision(*currentX, *newX, precision, relative); - - // Update environment variables. - swap = currentX; - currentX = newX; - newX = swap; - ++localIterations; - ++globalIterations; - } - - // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep - // track of the maximum. - if (localIterations > currentMaxLocalIterations) { - currentMaxLocalIterations = localIterations; - } - } - - // If we performed an odd number of global iterations, we need to swap the x and currentX, because the newest - // result is currently stored in currentX, but x is the output vector. - // TODO: Check whether this is correct or should be put into the for-loop over SCCs. - if (globalIterations % 2 == 1) { - std::swap(x, *currentX); - delete currentX; - } else { - delete newX; - } - - // Check if the solver converged and issue a warning otherwise. 
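A note on the x/currentX swap a few lines up: it is plain double buffering, and the final swap is needed exactly when an odd number of iterations was performed. A condensed sketch of the pattern (illustrative only; the actual update step is elided):

#include <cstddef>
#include <utility>
#include <vector>

void valueIteration(std::vector<double>& x, std::size_t iterations) {
    std::vector<double> buffer(x.size());
    std::vector<double>* currentX = &x;
    std::vector<double>* newX = &buffer;
    for (std::size_t i = 0; i < iterations; ++i) {
        // ... write the updated values into *newX, reading from *currentX ...
        std::swap(currentX, newX);
    }
    // After an odd number of swaps the newest values live in the temporary buffer.
    if (iterations % 2 == 1) {
        x = *currentX;
    }
}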
- if (converged) { - LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); - } else { - LOG4CPLUS_WARN(logger, "Iterative solver did not converge."); - } - } }; } // namespace prctl diff --git a/src/models/PseudoModel.cpp b/src/models/PseudoModel.cpp new file mode 100644 index 000000000..af446778c --- /dev/null +++ b/src/models/PseudoModel.cpp @@ -0,0 +1,111 @@ +#include "src/models/PseudoModel.h" +#include "src/utility/constants.h" +#include "src/models/AbstractModel.h" + +namespace storm { + namespace models { + + template + ModelBasedPseudoModel::ModelBasedPseudoModel(storm::models::AbstractModel const& model) : _model(model) { + // Intentionally left empty. + } + + template + NonDeterministicMatrixBasedPseudoModel::NonDeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix const& matrix, std::vector const& nondeterministicChoiceIndices) : _matrix(matrix), _nondeterministicChoiceIndices(nondeterministicChoiceIndices) { + // Intentionally left empty. + } + + template + DeterministicMatrixBasedPseudoModel::DeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix const& matrix) : _matrix(matrix) { + // Intentionally left empty. + } + + template + typename storm::storage::SparseMatrix::const_rows + ModelBasedPseudoModel::getRows(uint_fast64_t state) const { + return this->_model.getRows(state); + } + + template + typename storm::storage::SparseMatrix::const_rows + NonDeterministicMatrixBasedPseudoModel::getRows(uint_fast64_t state) const { + return this->_matrix.getRows(this->_nondeterministicChoiceIndices[state], this->_nondeterministicChoiceIndices[state + 1] - 1); + } + + template + typename storm::storage::SparseMatrix::const_rows + DeterministicMatrixBasedPseudoModel::getRows(uint_fast64_t state) const { + return this->_matrix.getRows(state, state); + } + + template + uint_fast64_t + ModelBasedPseudoModel::getNumberOfStates() const { + return this->_model.getNumberOfStates(); + } + + template + uint_fast64_t + NonDeterministicMatrixBasedPseudoModel::getNumberOfStates() const { + return this->_matrix.getColumnCount(); + } + + template + uint_fast64_t + DeterministicMatrixBasedPseudoModel::getNumberOfStates() const { + return this->_matrix.getColumnCount(); + } + + template + storm::storage::SparseMatrix + AbstractPseudoModel::extractPartitionDependencyGraph(storm::storage::Decomposition const& decomposition) const { + uint_fast64_t numberOfStates = decomposition.size(); + + // First, we need to create a mapping of states to their SCC index, to ease the computation of dependency transitions later. + std::vector stateToBlockMap(this->getNumberOfStates()); + for (uint_fast64_t i = 0; i < decomposition.size(); ++i) { + for (auto state : decomposition[i]) { + stateToBlockMap[state] = i; + } + } + + // The resulting sparse matrix will have as many rows/columns as there are blocks in the partition. + storm::storage::SparseMatrixBuilder dependencyGraphBuilder(numberOfStates, numberOfStates); + + for (uint_fast64_t currentBlockIndex = 0; currentBlockIndex < decomposition.size(); ++currentBlockIndex) { + // Get the next block. + typename storm::storage::StateBlock const& block = decomposition[currentBlockIndex]; + + // Now, we determine the blocks which are reachable (in one step) from the current block. 
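The loop that follows derives the block dependency graph from per-state transitions. The rule it implements, shown as a small self-contained sketch (the input representation here is hypothetical): block i gets an edge to block j exactly if some state of block i has a transition into block j.

#include <cstdint>
#include <set>
#include <utility>
#include <vector>

std::set<std::pair<uint_fast64_t, uint_fast64_t>> blockEdges(std::vector<std::vector<uint_fast64_t>> const& successors, std::vector<uint_fast64_t> const& stateToBlockMap) {
    std::set<std::pair<uint_fast64_t, uint_fast64_t>> edges;
    for (uint_fast64_t state = 0; state < successors.size(); ++state) {
        for (uint_fast64_t successor : successors[state]) {
            // Only transitions leaving the block induce a dependency.
            if (stateToBlockMap[state] != stateToBlockMap[successor]) {
                edges.emplace(stateToBlockMap[state], stateToBlockMap[successor]);
            }
        }
    }
    return edges;
}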
+ boost::container::flat_set allTargetBlocks; + for (auto state : block) { + for (auto const& transitionEntry : this->getRows(state)) { + uint_fast64_t targetBlock = stateToBlockMap[transitionEntry.getColumn()]; + + // We only need to consider transitions that are actually leaving the SCC. + if (targetBlock != currentBlockIndex) { + allTargetBlocks.insert(targetBlock); + } + } + } + + // Now we can just enumerate all the target SCCs and insert the corresponding transitions. + for (auto targetBlock : allTargetBlocks) { + dependencyGraphBuilder.addNextValue(currentBlockIndex, targetBlock, storm::utility::constantOne()); + } + } + + return dependencyGraphBuilder.build(); + } + + template class ModelBasedPseudoModel; + template class NonDeterministicMatrixBasedPseudoModel; + template class DeterministicMatrixBasedPseudoModel; + template class ModelBasedPseudoModel ; + template class NonDeterministicMatrixBasedPseudoModel ; + template class DeterministicMatrixBasedPseudoModel ; + template class ModelBasedPseudoModel; + template class NonDeterministicMatrixBasedPseudoModel; + template class DeterministicMatrixBasedPseudoModel; + } // namespace models +} // namespace storm \ No newline at end of file diff --git a/src/models/PseudoModel.h b/src/models/PseudoModel.h new file mode 100644 index 000000000..6d3fd38da --- /dev/null +++ b/src/models/PseudoModel.h @@ -0,0 +1,90 @@ +#ifndef STORM_MODELS_PSEUDOMODEL_H_ +#define STORM_MODELS_PSEUDOMODEL_H_ + +#include +#include "src/storage/SparseMatrix.h" +#include "src/storage/Decomposition.h" + +namespace storm { + namespace models { + // Forward declare the abstract model class. + template class AbstractModel; + + /*! + * These classes encapsulate the model/transition matrix on which the SCC decomposition is performed. + * The abstract base class is specialized by three possible representations: + * - For a model, the implementation ModelBasedPseudoModel hands the call to getRows() through to the model + * - For a matrix of a nondeterministic model, the implementation NonDeterministicMatrixBasedPseudoModel emulates the call + * on the matrix itself like the model function would + * - For a matrix of a deterministic model, the implementation DeterministicMatrixBasedPseudoModel emulates the call + * on the matrix itself like the model function would + */ + template + class AbstractPseudoModel { + public: + AbstractPseudoModel() {} + virtual ~AbstractPseudoModel() {} + + virtual typename storm::storage::SparseMatrix::const_rows getRows(uint_fast64_t state) const = 0; + + /*! + * Calculates the number of states in the represented system. + * @return The number of states in the underlying model/transition matrix + */ + virtual uint_fast64_t getNumberOfStates() const = 0; + + /*! + * Extracts the dependency graph from a (pseudo-) model according to the given partition. + * + * @param decomposition A decomposition containing the blocks of the partition of the system. + * @return A sparse matrix with bool entries that represents the dependency graph of the blocks of the partition. + */ + virtual storm::storage::SparseMatrix extractPartitionDependencyGraph(storm::storage::Decomposition const& decomposition) const; + }; + + template + class ModelBasedPseudoModel : public AbstractPseudoModel { + public: + /*!
+ * Creates an encapsulation for the SCC decomposition based on a model + * @param model The Model on which the decomposition is to be performed + */ + ModelBasedPseudoModel(storm::models::AbstractModel const& model); + virtual typename storm::storage::SparseMatrix::const_rows getRows(uint_fast64_t state) const override; + virtual uint_fast64_t getNumberOfStates() const override; + private: + storm::models::AbstractModel const& _model; + }; + + template + class NonDeterministicMatrixBasedPseudoModel : public AbstractPseudoModel { + public: + /*! + * Creates an encapsulation for the SCC decomposition based on a matrix + * @param matrix The Matrix on which the decomposition is to be performed + */ + NonDeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix const& matrix, std::vector const& nondeterministicChoiceIndices); + virtual typename storm::storage::SparseMatrix::const_rows getRows(uint_fast64_t state) const override; + virtual uint_fast64_t getNumberOfStates() const override; + private: + storm::storage::SparseMatrix const& _matrix; + std::vector const& _nondeterministicChoiceIndices; + }; + + template + class DeterministicMatrixBasedPseudoModel : public AbstractPseudoModel { + public: + /*! + * Creates an encapsulation for the SCC decomposition based on a matrix + * @param matrix The Matrix on which the decomposition is to be performed + */ + DeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix const& matrix); + virtual typename storm::storage::SparseMatrix::const_rows getRows(uint_fast64_t state) const override; + virtual uint_fast64_t getNumberOfStates() const override; + private: + storm::storage::SparseMatrix const& _matrix; + }; + } +} + +#endif // STORM_MODELS_PSEUDOMODEL_H_ \ No newline at end of file diff --git a/src/solver/NativeNondeterministicLinearEquationSolver.cpp b/src/solver/NativeNondeterministicLinearEquationSolver.cpp index 04867cab3..9d43130b0 100644 --- a/src/solver/NativeNondeterministicLinearEquationSolver.cpp +++ b/src/solver/NativeNondeterministicLinearEquationSolver.cpp @@ -66,7 +66,7 @@ namespace storm { } // Determine whether the method converged. - converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative); // Update environment variables. std::swap(currentX, newX); @@ -130,5 +130,6 @@ namespace storm { // Explicitly instantiate the solver. template class NativeNondeterministicLinearEquationSolver; + template class NativeNondeterministicLinearEquationSolver; } // namespace solver } // namespace storm diff --git a/src/solver/NativeNondeterministicLinearEquationSolver.h b/src/solver/NativeNondeterministicLinearEquationSolver.h index 69542a166..60dd09149 100644 --- a/src/solver/NativeNondeterministicLinearEquationSolver.h +++ b/src/solver/NativeNondeterministicLinearEquationSolver.h @@ -34,7 +34,7 @@ namespace storm { virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult = nullptr, std::vector* newX = nullptr) const override; - private: + protected: // The required precision for the iterative methods. 
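(This hunk widens the members from private to protected so that the TopologicalValueIterationNondeterministicLinearEquationSolver introduced later in this patch can reuse them; a hypothetical subclass illustrating the access:)

template<typename ValueType>
class SomeDerivedSolver : public NativeNondeterministicLinearEquationSolver<ValueType> {
    bool withinPrecision(ValueType diff) const {
        return diff <= this->precision;   // legal only because 'precision' is now protected
    }
};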
double precision; diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp new file mode 100644 index 000000000..c8fbf9a4e --- /dev/null +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -0,0 +1,467 @@ +#include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h" + +#include +#include + +#include "src/settings/Settings.h" +#include "src/utility/vector.h" +#include "src/utility/graph.h" +#include "src/models/PseudoModel.h" +#include "src/storage/StronglyConnectedComponentDecomposition.h" +#include "src/exceptions/IllegalArgumentException.h" +#include "src/exceptions/InvalidStateException.h" + +#include "log4cplus/logger.h" +#include "log4cplus/loggingmacros.h" +extern log4cplus::Logger logger; + +#include "storm-config.h" +#ifdef STORM_HAVE_CUDAFORSTORM +# include "cudaForStorm.h" +#endif + +namespace storm { + namespace solver { + + template + TopologicalValueIterationNondeterministicLinearEquationSolver::TopologicalValueIterationNondeterministicLinearEquationSolver() { + // Get the settings object to customize solving. + storm::settings::Settings* settings = storm::settings::Settings::getInstance(); + + // Get appropriate settings. + this->maximalNumberOfIterations = settings->getOptionByLongName("maxiter").getArgument(0).getValueAsUnsignedInteger(); + this->precision = settings->getOptionByLongName("precision").getArgument(0).getValueAsDouble(); + this->relative = !settings->isSet("absolute"); + } + + template + TopologicalValueIterationNondeterministicLinearEquationSolver::TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative) : NativeNondeterministicLinearEquationSolver(precision, maximalNumberOfIterations, relative) { + // Intentionally left empty. + } + + template + NondeterministicLinearEquationSolver* TopologicalValueIterationNondeterministicLinearEquationSolver::clone() const { + return new TopologicalValueIterationNondeterministicLinearEquationSolver(*this); + } + + template + void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { + +#ifdef GPU_USE_FLOAT +#define __FORCE_FLOAT_CALCULATION true +#else +#define __FORCE_FLOAT_CALCULATION false +#endif + if (__FORCE_FLOAT_CALCULATION && (sizeof(ValueType) == sizeof(double))) { + TopologicalValueIterationNondeterministicLinearEquationSolver tvindles(precision, maximalNumberOfIterations, relative); + + storm::storage::SparseMatrix new_A = A.toValueType(); + std::vector new_x = storm::utility::vector::toValueType(x); + std::vector const new_b = storm::utility::vector::toValueType(b); + + tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr); + + for (size_t i = 0, size = new_x.size(); i < size; ++i) { + x.at(i) = new_x.at(i); + } + return; + } + + // For testing only + if (sizeof(ValueType) == sizeof(double)) { + std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; + LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>"); + } else { + std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; + LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>"); + } + + // Now, we need to determine the SCCs of the MDP and perform a topological sort. 
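The code that follows first checks whether the complete equation system fits into GPU memory and only falls back to a per-SCC decomposition otherwise. The gate, condensed (variable names are illustrative; the helper functions are the ones this patch exports from cudaForStorm):

size_t const gpuSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(rowCount, rowGroupCount, entryCount);
size_t const budget = static_cast<size_t>(getFreeCudaMemory() * 0.95);   // keep 5% padding
if (gpuSize < budget) {
    // one GPU call solves the whole system
} else {
    // SCC decomposition, grouped to fit the budget, solved in topological order
}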
+ std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); + storm::models::NonDeterministicMatrixBasedPseudoModel const pseudoModel(A, nondeterministicChoiceIndices); + + // Check if the decomposition is necessary +#ifdef STORM_HAVE_CUDAFORSTORM +#define __USE_CUDAFORSTORM_OPT true + size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(A.getRowCount()), nondeterministicChoiceIndices.size(), static_cast(A.getEntryCount())); + size_t const cudaFreeMemory = static_cast(getFreeCudaMemory() * 0.95); +#else +#define __USE_CUDAFORSTORM_OPT false + size_t const gpuSizeOfCompleteSystem = 0; + size_t const cudaFreeMemory = 0; +#endif + std::vector> sccDecomposition; + if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) { + // Dummy output for SCC Times + std::cout << "Computing the SCC Decomposition took 0ms" << std::endl; + +#ifdef STORM_HAVE_CUDAFORSTORM + if (!resetCudaDevice()) { + LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); + throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; + } + + std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now(); + bool result = false; + size_t globalIterations = 0; + if (minimize) { + result = __basicValueIteration_mvReduce_uint64_minimize(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); + } else { + result = __basicValueIteration_mvReduce_uint64_maximize(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); + } + LOG4CPLUS_INFO(logger, "Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); + + bool converged = false; + if (!result) { + converged = false; + LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; + } else { + converged = true; + } + + std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now(); + std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast(calcEndTime - calcStartTime).count() << "ms." << std::endl; + + std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << globalIterations << " iterations in a single block." << std::endl; + + // Check if the solver converged and issue a warning otherwise. + if (converged) { + LOG4CPLUS_INFO(logger, "Iterative solver converged after " << globalIterations << " iterations."); + } else { + LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << globalIterations << " iterations."); + } +#else + LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. 
Internal Error!"; +#endif + } else { + std::chrono::high_resolution_clock::time_point sccStartTime = std::chrono::high_resolution_clock::now(); + storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); + + if (sccDecomposition.size() == 0) { + LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs."); + throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs."; + } + + storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); + std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); + + // Calculate the optimal distribution of sccs + std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); + LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); + + std::chrono::high_resolution_clock::time_point sccEndTime = std::chrono::high_resolution_clock::now(); + std::cout << "Computing the SCC Decomposition took " << std::chrono::duration_cast(sccEndTime - sccStartTime).count() << "ms." << std::endl; + + std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now(); + + std::vector* currentX = nullptr; + std::vector* swap = nullptr; + size_t currentMaxLocalIterations = 0; + size_t localIterations = 0; + size_t globalIterations = 0; + bool converged = true; + + // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only + // solved after all SCCs it depends on have been solved. + int counter = 0; + + for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { + bool const useGpu = sccIndexIt->first; + storm::storage::StateBlock const& scc = sccIndexIt->second; + + // Generate a sub matrix + storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); + storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); + std::vector sccSubB(sccSubmatrix.getRowCount()); + storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); + std::vector sccSubX(sccSubmatrix.getColumnCount()); + std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); + std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); + + // Prepare the pointers for swapping in the calculation + currentX = &sccSubX; + swap = &sccSubXSwap; + + storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. 
choices times states) + std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); + sccSubNondeterministicChoiceIndices.at(0) = 0; + + // Pre-process all dependent states + // Remove outgoing transitions and create the ChoiceIndices + uint_fast64_t innerIndex = 0; + uint_fast64_t outerIndex = 0; + for (uint_fast64_t state : scc) { + // Choice Indices + sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); + + for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { + typename storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); + for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { + if (!subMatrixIndices.get(rowIt->getColumn())) { + // This is an outgoing transition of a state in the SCC to a state not included in the SCC + // Subtracting Pr(tau) * x_other from b fixes that + sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); + } + } + ++innerIndex; + } + ++outerIndex; + } + + // For the current SCC, we need to perform value iteration until convergence. + if (useGpu) { +#ifdef STORM_HAVE_CUDAFORSTORM + if (!resetCudaDevice()) { + LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); + throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; + } + + //LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); + //LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); + //LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); + + bool result = false; + localIterations = 0; + if (minimize) { + result = __basicValueIteration_mvReduce_uint64_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } else { + result = __basicValueIteration_mvReduce_uint64_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); + + if (!result) { + converged = false; + LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; + } else { + converged = true; + } + + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. 
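The pre-processing above removes transitions that leave the SCC by folding their known contribution into the constant vector: b'[row] = b[row] + sum over all targets t outside the SCC of p(row, t) * x[t]. A self-contained sketch of that fix-up (the row representation here is hypothetical):

#include <cstdint>
#include <utility>
#include <vector>

double foldOutgoingTransitions(std::vector<std::pair<uint_fast64_t, double>> const& row, std::vector<bool> const& inScc, std::vector<double> const& x, double b) {
    for (auto const& entry : row) {
        if (!inScc[entry.first]) {
            b += entry.second * x[entry.first];   // probability times already-computed value
        }
    }
    return b;
}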
+ if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + globalIterations += localIterations; +#else + LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; +#endif + } else { + std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl; + localIterations = 0; + converged = false; + while (!converged && localIterations < this->maximalNumberOfIterations) { + // Compute x' = A*x + b. + sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); + storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); + + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); + //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + + /* + Versus: + A.multiplyWithVector(*currentX, *multiplyResult); + storm::utility::vector::addVectorsInPlace(*multiplyResult, b); + */ + + // Reduce the vector x' by applying min/max for all non-deterministic choices. + if (minimize) { + storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } else { + storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } + + // Determine whether the method converged. + // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher + // running time. In fact, it is faster. This has to be investigated. + // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); + + // Update environment variables. + std::swap(currentX, swap); + + ++localIterations; + ++globalIterations; + } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); + } + + + // The Result of this SCC has to be taken back into the main result vector + innerIndex = 0; + for (uint_fast64_t state : scc) { + x.at(state) = currentX->at(innerIndex); + ++innerIndex; + } + + // Since the pointers for swapping in the calculation point to temps they should not be valid anymore + currentX = nullptr; + swap = nullptr; + + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + } + + std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << localIterations << " iterations in a single block." << std::endl; + + // Check if the solver converged and issue a warning otherwise. + if (converged) { + LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); + } else { + LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations."); + } + + std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now(); + std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast(calcEndTime - calcStartTime).count() << "ms." 
<< std::endl; + } + } + + template + std::vector> + TopologicalValueIterationNondeterministicLinearEquationSolver::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const { + std::vector> result; +#ifdef STORM_HAVE_CUDAFORSTORM + // 95% to have a bit of padding + size_t const cudaFreeMemory = static_cast(getFreeCudaMemory() * 0.95); + size_t lastResultIndex = 0; + + std::vector const& rowGroupIndices = matrix.getRowGroupIndices(); + + size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(matrix.getRowCount()), rowGroupIndices.size(), static_cast(matrix.getEntryCount())); + size_t const gpuSizePerRowGroup = std::max(static_cast(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast(1)); + size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup; + + size_t currentSize = 0; + size_t neededReserveSize = 0; + size_t startIndex = 0; + for (size_t i = 0; i < topologicalSort.size(); ++i) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]]; + size_t const currentSccSize = scc.size(); + + uint_fast64_t rowCount = 0; + uint_fast64_t entryCount = 0; + + for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { + rowCount += matrix.getRowGroupSize(*sccIt); + entryCount += matrix.getRowGroupEntryCount(*sccIt); + } + + size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(rowCount), scc.size(), static_cast(entryCount)); + + if ((currentSize + sccSize) <= cudaFreeMemory) { + // There is enough space left in the current group + neededReserveSize += currentSccSize; + currentSize += sccSize; + } else { + // This would make the last open group to big for the GPU + + if (startIndex < i) { + if ((startIndex + 1) < i) { + // More than one component + std::vector tempGroups; + tempGroups.reserve(neededReserveSize); + + // Copy the first group to make inplace_merge possible + storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; + tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); + + if (((startIndex + 1) + 80) >= i) { + size_t lastSize = 0; + for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + lastSize = tempGroups.size(); + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + std::vector::iterator middleIterator = tempGroups.begin(); + std::advance(middleIterator, lastSize); + std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); + } + } else { + // Use std::sort + for (size_t j = startIndex + 1; j < i; ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + } + std::sort(tempGroups.begin(), tempGroups.end()); + } + result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); + } else { + // Only one group, copy construct. 
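The merge logic above (and its duplicate after the loop) builds one sorted state block out of several sorted SCCs; for small group counts repeated std::inplace_merge beats a single final std::sort. The pattern in isolation:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint_fast64_t> mergeSortedBlocks(std::vector<std::vector<uint_fast64_t>> const& blocks) {
    std::vector<uint_fast64_t> merged;
    for (auto const& block : blocks) {
        std::size_t const oldSize = merged.size();
        merged.insert(merged.end(), block.begin(), block.end());
        // Prefix and appended tail are each sorted, so merging keeps the invariant.
        std::inplace_merge(merged.begin(), merged.begin() + oldSize, merged.end());
    }
    return merged;
}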
+ result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); + } + ++lastResultIndex; + } + + if (sccSize <= cudaFreeMemory) { + currentSize = sccSize; + neededReserveSize = currentSccSize; + startIndex = i; + } else { + // This group is too big to fit into the CUDA Memory by itself + result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]])))); + ++lastResultIndex; + + currentSize = 0; + neededReserveSize = 0; + startIndex = i + 1; + } + } + } + + size_t const topologicalSortSize = topologicalSort.size(); + if (startIndex < topologicalSortSize) { + if ((startIndex + 1) < topologicalSortSize) { + // More than one component + std::vector tempGroups; + tempGroups.reserve(neededReserveSize); + + // Copy the first group to make inplace_merge possible + storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; + tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); + + // For set counts <= 80, Inplace Merge is faster + if (((startIndex + 1) + 80) >= topologicalSortSize) { + size_t lastSize = 0; + for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + lastSize = tempGroups.size(); + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + std::vector::iterator middleIterator = tempGroups.begin(); + std::advance(middleIterator, lastSize); + std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); + } + } else { + // Use std::sort + for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + } + std::sort(tempGroups.begin(), tempGroups.end()); + } + result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); + } + else { + // Only one group, copy construct. + result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); + } + ++lastResultIndex; + } +#else + for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { + storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; + result.push_back(std::make_pair(false, scc)); + } +#endif + return result; + } + + // Explicitly instantiate the solver. 
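Stripped of the merging details, the grouping above is a greedy bin-filling pass over the topologically sorted SCCs: keep adding SCCs to the current GPU batch while the estimated memory stays under budget, flush otherwise (in the real code an SCC that exceeds the budget by itself is handed to the CPU path instead). A condensed sketch:

#include <cstddef>
#include <vector>

std::vector<std::vector<std::size_t>> packSccs(std::vector<std::size_t> const& sccMemSizes, std::size_t budget) {
    std::vector<std::vector<std::size_t>> batches(1);
    std::size_t used = 0;
    for (std::size_t i = 0; i < sccMemSizes.size(); ++i) {
        if (!batches.back().empty() && used + sccMemSizes[i] > budget) {
            batches.emplace_back();   // flush: start a new batch
            used = 0;
        }
        batches.back().push_back(i);
        used += sccMemSizes[i];
    }
    return batches;
}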
+ template class TopologicalValueIterationNondeterministicLinearEquationSolver; + template class TopologicalValueIterationNondeterministicLinearEquationSolver; + } // namespace solver +} // namespace storm diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h new file mode 100644 index 000000000..bdbf788cd --- /dev/null +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h @@ -0,0 +1,97 @@ +#ifndef STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ +#define STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ + +#include "src/solver/NativeNondeterministicLinearEquationSolver.h" +#include "src/storage/StronglyConnectedComponentDecomposition.h" +#include "src/storage/SparseMatrix.h" + +#include +#include + +#include "storm-config.h" +#ifdef STORM_HAVE_CUDAFORSTORM +# include "cudaForStorm.h" +#endif + +namespace storm { + namespace solver { + + /*! + * A class that uses SCC decompositions to solve a nondeterministic linear equation system. + */ + template + class TopologicalValueIterationNondeterministicLinearEquationSolver : public NativeNondeterministicLinearEquationSolver { + public: + /*! + * Constructs a nondeterministic linear equation solver with parameters being set according to the settings + * object. + */ + TopologicalValueIterationNondeterministicLinearEquationSolver(); + + /*! + * Constructs a nondeterministic linear equation solver with the given parameters. + * + * @param precision The precision to use for convergence detection. + * @param maximalNumberOfIterations The maximal number of iterations to perform before iteration is aborted. + * @param relative If set, the relative error rather than the absolute error is considered for convergence + * detection. + */ + TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative = true); + + virtual NondeterministicLinearEquationSolver* clone() const override; + + virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult = nullptr, std::vector* newX = nullptr) const override; + private: + /*! + * Given a topological sort of an SCC decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory.
+ */ + std::vector> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition const& sccDecomposition, std::vector const& topologicalSort, storm::storage::SparseMatrix const& matrix) const; + }; + + template + bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + // + throw; + } + template <> + inline bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { +#ifdef STORM_HAVE_CUDAFORSTORM + return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + throw; +#endif + } + template <> + inline bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { +#ifdef STORM_HAVE_CUDAFORSTORM + return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + throw; +#endif + } + + template + bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + // + throw; + } + template <> + inline bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { +#ifdef STORM_HAVE_CUDAFORSTORM + return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + throw; +#endif + } + template <> + inline bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { +#ifdef STORM_HAVE_CUDAFORSTORM + return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); +#else + throw; +#endif + } + 
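A sketch of how the dispatch above is meant to be used (all variable names here are illustrative): instantiating the helper with double routes to basicValueIteration_mvReduce_uint64_double_minimize, float routes to the float entry point, and any other type hits the throwing primary template.

size_t iterationCount = 0;
bool const converged = __basicValueIteration_mvReduce_uint64_minimize<double>(
    maxIterationCount, precision, /* relative */ true,
    matrixRowIndices, columnsAndValues, x, b, nondeterministicChoiceIndices, iterationCount);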
    } // namespace solver
} // namespace storm

#endif /* STORM_SOLVER_NATIVENONDETERMINISTICLINEAREQUATIONSOLVER_H_ */
diff --git a/src/utility/graph.h b/src/utility/graph.h
index b41883d7c..be2d2e9df 100644
--- a/src/utility/graph.h
+++ b/src/utility/graph.h
@@ -128,7 +128,7 @@ namespace storm {
             uint_fast64_t numberOfStates = phiStates.size();
             storm::storage::BitVector statesWithProbabilityGreater0(numberOfStates);
 
-            // Add all psi states as the already satisfy the condition.
+            // Add all psi states as they already satisfy the condition.
             statesWithProbabilityGreater0 |= psiStates;
 
             // Initialize the stack used for the DFS with the states.
@@ -667,7 +667,7 @@ namespace storm {
                 LOG4CPLUS_ERROR(logger, "Provided matrix is required to be square.");
                 throw storm::exceptions::InvalidArgumentException() << "Provided matrix is required to be square.";
             }
-            
+
             uint_fast64_t numberOfStates = matrix.getRowCount();
 
             // Prepare the result. This relies on the matrix being square.
@@ -696,12 +696,12 @@ namespace storm {
 
         recursionStepBackward:
             for (; successorIterator != matrix.end(currentState); ++successorIterator) {
-                if (!visitedStates.get(successorIterator.first)) {
+                if (!visitedStates.get(successorIterator->getColumn())) {
                     // Put unvisited successor on top of our recursion stack and remember that.
-                    recursionStack.push_back(successorIterator.first);
+                    recursionStack.push_back(successorIterator->getColumn());
 
                     // Also, put initial value for iterator on corresponding recursion stack.
-                    iteratorRecursionStack.push_back(matrix.begin(successorIterator.first));
+                    iteratorRecursionStack.push_back(matrix.begin(successorIterator->getColumn()));
 
                     goto recursionStepForward;
                 }
diff --git a/src/utility/vector.h b/src/utility/vector.h
index 811e561be..1547903fa 100644
--- a/src/utility/vector.h
+++ b/src/utility/vector.h
@@ -329,7 +329,10 @@ namespace storm {
         template<typename T>
         bool equalModuloPrecision(T const& val1, T const& val2, T precision, bool relativeError = true) {
             if (relativeError) {
-                if (std::abs(val1 - val2)/val2 > precision) return false;
+                if (val2 == 0) {
+                    return (std::abs(val1) <= precision);
+                }
+                if (std::abs((val1 - val2)/val2) > precision) return false;
             } else {
                 if (std::abs(val1 - val2) > precision) return false;
             }
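The new branch guards the division in the relative check: for val2 == 0 the quotient is undefined, so the values now count as equal iff |val1| <= precision; and since the absolute value is now taken after the division, a negative quotient can no longer slip below the threshold. A few illustrative evaluations (hypothetical inputs, T deduced as double):

    storm::utility::vector::equalModuloPrecision(1e-7, 0.0, 1e-6, true);    // true:  |1e-7| <= 1e-6
    storm::utility::vector::equalModuloPrecision(99.0, 100.0, 0.02, true);  // true:  |(99 - 100)/100| = 0.01
    storm::utility::vector::equalModuloPrecision(105.0, 100.0, 0.02, true); // false: |(105 - 100)/100| = 0.05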
@@ -419,6 +422,20 @@ namespace storm {
 
             return subVector;
         }
+
+        /*!
+         * Converts the given vector to the given ValueType.
+         */
+        template<typename NewValueType, typename ValueType>
+        std::vector<NewValueType> toValueType(std::vector<ValueType> const& oldVector) {
+            std::vector<NewValueType> resultVector;
+            resultVector.resize(oldVector.size());
+            for (size_t i = 0, size = oldVector.size(); i < size; ++i) {
+                resultVector.at(i) = static_cast<NewValueType>(oldVector.at(i));
+            }
+
+            return resultVector;
+        }
     } // namespace vector
 } // namespace utility
 } // namespace storm
diff --git a/storm-config.h.in b/storm-config.h.in
index 40db93554..741bc6939 100644
--- a/storm-config.h.in
+++ b/storm-config.h.in
@@ -23,6 +23,9 @@
 // Whether GLPK is available and to be used (define/undef)
 #@STORM_CPP_GLPK_DEF@ STORM_HAVE_GLPK
 
+// Whether CudaForStorm is available and to be used (define/undef)
+#@STORM_CPP_CUDAFORSTORM_DEF@ STORM_HAVE_CUDAFORSTORM
+
 // Whether Z3 is available and to be used (define/undef)
 #@STORM_CPP_Z3_DEF@ STORM_HAVE_Z3
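With the configure-time switch wired through storm-config.h.in, code can gate its CUDA paths at compile time; the plugin test further down uses exactly this pattern:

    #include "storm-config.h"

    #ifdef STORM_HAVE_CUDAFORSTORM
    #include "cudaForStorm.h" // interface of the CUDA plugin
    #endif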
diff --git a/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
new file mode 100644
index 000000000..e125b09a2
--- /dev/null
+++ b/test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
@@ -0,0 +1,240 @@
+#include "gtest/gtest.h"
+#include "storm-config.h"
+
+#include "src/solver/NativeNondeterministicLinearEquationSolver.h"
+#include "src/settings/Settings.h"
+#include "src/modelchecker/prctl/SparseMdpPrctlModelChecker.h"
+#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h"
+#include "src/parser/AutoParser.h"
+
+TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) {
+    storm::settings::Settings* s = storm::settings::Settings::getInstance();
+    std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew")->as<storm::models::Mdp<double>>();
+
+    ASSERT_EQ(mdp->getNumberOfStates(), 169ull);
+    ASSERT_EQ(mdp->getNumberOfTransitions(), 436ull);
+
+    storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp);
+
+    storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("two");
+    storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true);
+
+    std::vector<double> result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 0.0277777612209320068), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("two");
+    eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 0.0277777612209320068), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("three");
+    eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 0.0555555224418640136), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("three");
+    eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 0.0555555224418640136), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("four");
+    eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 0.083333283662796020508), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("four");
+    eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 0.083333283662796020508), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("done");
+    storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true);
+
+    result = mc.checkNoBoundOperator(*rewardFormula);
+
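+    // The expected reward differs slightly between the CUDA and the CPU back end;
+    // presumably both results lie within the configured precision of the exact value,
+    // but the two solvers converge along different floating-point paths.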
+#ifdef STORM_HAVE_CUDAFORSTORM
+    ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#else
+    ASSERT_LT(std::abs(result[0] - 7.33332904), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#endif
+    delete rewardFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("done");
+    reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false);
+
+    result = mc.checkNoBoundOperator(*rewardFormula);
+
+#ifdef STORM_HAVE_CUDAFORSTORM
+    ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#else
+    ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#endif
+    delete rewardFormula;
+
+    std::shared_ptr<storm::models::Mdp<double>> stateRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", "")->as<storm::models::Mdp<double>>();
+
+    storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> stateRewardModelChecker(*stateRewardMdp);
+
+    apFormula = new storm::property::prctl::Ap<double>("done");
+    reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true);
+
+    result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula);
+
+#ifdef STORM_HAVE_CUDAFORSTORM
+    ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#else
+    ASSERT_LT(std::abs(result[0] - 7.33332904), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#endif
+    delete rewardFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("done");
+    reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false);
+
+    result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula);
+
+#ifdef STORM_HAVE_CUDAFORSTORM
+    ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#else
+    ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#endif
+    delete rewardFormula;
+
+    std::shared_ptr<storm::models::Mdp<double>> stateAndTransitionRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew")->as<storm::models::Mdp<double>>();
+
+    storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp);
+
+    apFormula = new storm::property::prctl::Ap<double>("done");
+    reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true);
+
+    result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula);
+
+#ifdef STORM_HAVE_CUDAFORSTORM
+    ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#else
+    ASSERT_LT(std::abs(result[0] - 14.6666581), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#endif
+    delete rewardFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("done");
+    reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false);
+
+    result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula);
+
+#ifdef STORM_HAVE_CUDAFORSTORM
+    ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#else
+    ASSERT_LT(std::abs(result[0] - 14.666663), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#endif
+    delete rewardFormula;
+}
+
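+// Note: checkNoBoundOperator returns one value per state; the assertions index the
+// model's initial state (state 0 for the examples in this file).
+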
+TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) {
+    storm::settings::Settings* s = storm::settings::Settings::getInstance();
+    std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.trans.rew")->as<storm::models::Mdp<double>>();
+
+    ASSERT_EQ(mdp->getNumberOfStates(), 3172ull);
+    ASSERT_EQ(mdp->getNumberOfTransitions(), 7144ull);
+
+    storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp);
+
+    storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("elected");
+    storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true);
+
+    std::vector<double> result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 1), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("elected");
+    eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 1), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("elected");
+    storm::property::prctl::BoundedEventually<double>* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, false);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 0.0625), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("elected");
+    boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, true);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 0.0625), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("elected");
+    storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true);
+
+    result = mc.checkNoBoundOperator(*rewardFormula);
+
+#ifdef STORM_HAVE_CUDAFORSTORM
+    ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#else
+    ASSERT_LT(std::abs(result[0] - 4.285701547), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#endif
+    delete rewardFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("elected");
+    reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false);
+
+    result = mc.checkNoBoundOperator(*rewardFormula);
+
+#ifdef STORM_HAVE_CUDAFORSTORM
+    ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#else
+    ASSERT_LT(std::abs(result[0] - 4.285703591), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+#endif
+    delete rewardFormula;
+}
diff --git a/test/functional/solver/CudaPluginTest.cpp b/test/functional/solver/CudaPluginTest.cpp
new file mode 100644
index 000000000..da52f3e2b
--- /dev/null
+++ b/test/functional/solver/CudaPluginTest.cpp
@@ -0,0 +1,354 @@
+#include "gtest/gtest.h"
+#include "src/storage/SparseMatrix.h"
+#include "src/exceptions/InvalidStateException.h"
+#include "src/exceptions/OutOfRangeException.h"
+
+#include "storm-config.h"
+
+#ifdef STORM_HAVE_CUDAFORSTORM
+
+#include "cudaForStorm.h"
+
+TEST(CudaPlugin, SpMV_4x4) {
+    storm::storage::SparseMatrixBuilder<double> matrixBuilder(4, 4, 10);
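+    // The builder yields the matrix in CSR form; the __internal_ accessors used below
+    // expose its raw row-offset and packed column/value arrays so that they can be
+    // copied to the device unchanged. With x = (0, 4, 1, 1) the expected product
+    // b = A * x is: row 0: 1.0*4 + (-1.0)*1 = 3; row 1: 8.0*0 + 7.0*4 - 5.0*1 + 2.0*1 = 25;
+    // row 2: 2.0*0 + 2.0*4 + 4.0*1 + 4.0*1 = 16; row 3 (empty): 0.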
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 8.0));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 2, -5.0));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 3, 2.0));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 0, 2.0));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 2.0));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, 4.0));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 4.0));
+
+    storm::storage::SparseMatrix<double> matrix;
+    ASSERT_NO_THROW(matrix = matrixBuilder.build());
+
+    ASSERT_EQ(4, matrix.getRowCount());
+    ASSERT_EQ(4, matrix.getColumnCount());
+    ASSERT_EQ(10, matrix.getEntryCount());
+
+    std::vector<double> x({0, 4, 1, 1});
+    std::vector<double> b({0, 0, 0, 0});
+
+    ASSERT_NO_THROW(basicValueIteration_spmv_uint64_double(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b));
+
+    ASSERT_EQ(b.at(0), 3);
+    ASSERT_EQ(b.at(1), 25);
+    ASSERT_EQ(b.at(2), 16);
+    ASSERT_EQ(b.at(3), 0);
+}
+
+TEST(CudaPlugin, SpMV_4x4_float) {
+    storm::storage::SparseMatrixBuilder<float> matrixBuilder(4, 4, 10);
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0f));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0f));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 8.0f));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0f));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 2, -5.0f));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 3, 2.0f));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 0, 2.0f));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 2.0f));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, 4.0f));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 4.0f));
+
+    storm::storage::SparseMatrix<float> matrix;
+    ASSERT_NO_THROW(matrix = matrixBuilder.build());
+
+    ASSERT_EQ(4, matrix.getRowCount());
+    ASSERT_EQ(4, matrix.getColumnCount());
+    ASSERT_EQ(10, matrix.getEntryCount());
+
+    std::vector<float> x({ 0.f, 4.f, 1.f, 1.f });
+    std::vector<float> b({ 0.f, 0.f, 0.f, 0.f });
+
+    ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b));
+
+    ASSERT_EQ(b.at(0), 3);
+    ASSERT_EQ(b.at(1), 25);
+    ASSERT_EQ(b.at(2), 16);
+    ASSERT_EQ(b.at(3), 0);
+}
+
+TEST(CudaPlugin, SpMV_VerySmall) {
+    storm::storage::SparseMatrixBuilder<double> matrixBuilder(2, 2, 2);
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0));
+
+    storm::storage::SparseMatrix<double> matrix;
+    ASSERT_NO_THROW(matrix = matrixBuilder.build());
+
+    ASSERT_EQ(2, matrix.getRowCount());
+    ASSERT_EQ(2, matrix.getColumnCount());
+    ASSERT_EQ(2, matrix.getEntryCount());
+
+    std::vector<double> x({ 4.0, 8.0 });
+    std::vector<double> b({ 0.0, 0.0 });
+
+    ASSERT_NO_THROW(basicValueIteration_spmv_uint64_double(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b));
+
+    ASSERT_EQ(b.at(0), 4.0);
+    ASSERT_EQ(b.at(1), 16.0);
+}
+
+TEST(CudaPlugin, SpMV_VerySmall_float) {
+    storm::storage::SparseMatrixBuilder<float> matrixBuilder(2, 2, 2);
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0f));
+    ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0f));
+
+    storm::storage::SparseMatrix<float> matrix;
+    ASSERT_NO_THROW(matrix = matrixBuilder.build());
+
+    ASSERT_EQ(2, matrix.getRowCount());
+    ASSERT_EQ(2, matrix.getColumnCount());
+    ASSERT_EQ(2, matrix.getEntryCount());
+
+    std::vector<float> x({ 4.0f, 8.0f });
+    std::vector<float> b({ 0.0f, 0.0f });
+
+    ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b));
+
+    ASSERT_EQ(b.at(0), 4.0f);
+    ASSERT_EQ(b.at(1), 16.0f);
+}
+
+TEST(CudaPlugin, AddVectorsInplace) {
+    std::vector<double> vectorA_1 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 };
+    std::vector<double> vectorA_2 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 };
+    std::vector<double> vectorA_3 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 };
+    std::vector<double> vectorB = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+    std::vector<double> vectorC = { -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0 };
+
+    ASSERT_EQ(vectorA_1.size(), 8);
+    ASSERT_EQ(vectorA_2.size(), 8);
+    ASSERT_EQ(vectorA_3.size(), 8);
+    ASSERT_EQ(vectorB.size(), 8);
+    ASSERT_EQ(vectorC.size(), 8);
+
+    ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_1, vectorB));
+    ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_2, vectorC));
+
+    ASSERT_EQ(vectorA_1.size(), 8);
+    ASSERT_EQ(vectorA_2.size(), 8);
+    ASSERT_EQ(vectorA_3.size(), 8);
+    ASSERT_EQ(vectorB.size(), 8);
+    ASSERT_EQ(vectorC.size(), 8);
+
+    for (size_t i = 0; i < vectorA_3.size(); ++i) {
+        double cpu_result_b = vectorA_3.at(i) + vectorB.at(i);
+        double cpu_result_c = vectorA_3.at(i) + vectorC.at(i);
+
+        ASSERT_EQ(cpu_result_b, vectorA_1.at(i));
+        ASSERT_EQ(cpu_result_c, vectorA_2.at(i));
+    }
+}
+
+TEST(CudaPlugin, AddVectorsInplace_float) {
+    std::vector<float> vectorA_1 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f };
+    std::vector<float> vectorA_2 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f };
+    std::vector<float> vectorA_3 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f };
+    std::vector<float> vectorB = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+    std::vector<float> vectorC = { -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f };
+
+    ASSERT_EQ(vectorA_1.size(), 8);
+    ASSERT_EQ(vectorA_2.size(), 8);
+    ASSERT_EQ(vectorA_3.size(), 8);
+    ASSERT_EQ(vectorB.size(), 8);
+    ASSERT_EQ(vectorC.size(), 8);
+
+    ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_1, vectorB));
+    ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_2, vectorC));
+
+    ASSERT_EQ(vectorA_1.size(), 8);
+    ASSERT_EQ(vectorA_2.size(), 8);
+    ASSERT_EQ(vectorA_3.size(), 8);
+    ASSERT_EQ(vectorB.size(), 8);
+    ASSERT_EQ(vectorC.size(), 8);
+
+    for (size_t i = 0; i < vectorA_3.size(); ++i) {
+        float cpu_result_b = vectorA_3.at(i) + vectorB.at(i);
+        float cpu_result_c = vectorA_3.at(i) + vectorC.at(i);
+
+        ASSERT_EQ(cpu_result_b, vectorA_1.at(i));
+        ASSERT_EQ(cpu_result_c, vectorA_2.at(i));
+    }
+}
+
+TEST(CudaPlugin, ReduceGroupedVector) {
+    std::vector<double> groupedVector = {
+        0.0, -1000.0, 0.000004, // Group 0
+        5.0,                    // Group 1
+        0.0, 1.0, 2.0, 3.0,     // Group 2
+        -1000.0, -3.14, -0.0002,// Group 3 (neg only)
+        25.25, 25.25, 25.25,    // Group 4
+        0.0, 0.0, 1.0,          // Group 5
+        -0.000001, 0.000001     // Group 6
+    };
+    std::vector<uint_fast64_t> grouping = {
+        0, 3, 4, 8, 11, 14, 17, 19
+    };
+
+    std::vector<double> result_minimize = {
+        -1000.0, // Group 0
+        5.0,
+        0.0,
+        -1000.0,
+        25.25,
+        0.0,
+        -0.000001
+    };
+    std::vector<double> result_maximize = {
+        0.000004,
+        5.0,
+        3.0,
+        -0.0002,
+        25.25,
+        1.0,
+        0.000001
+    };
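+    // grouping stores row-group offsets in the same style as an MDP's nondeterministic
+    // choice indices: group i spans the half-open range [grouping[i], grouping[i+1]).
+    // The minimize/maximize kernels reduce each group to its extremal entry, e.g.
+    // group 0 = {0.0, -1000.0, 0.000004} reduces to -1000.0 (minimize) and 0.000004 (maximize).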
+
+    std::vector<double> result_cuda_minimize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+    std::vector<double> result_cuda_maximize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+
+    ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_minimize(groupedVector, grouping, result_cuda_minimize));
+    ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_maximize(groupedVector, grouping, result_cuda_maximize));
+
+    for (size_t i = 0; i < result_minimize.size(); ++i) {
+        ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i));
+        ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i));
+    }
+}
+
+TEST(CudaPlugin, ReduceGroupedVector_float) {
+    std::vector<float> groupedVector = {
+        0.0f, -1000.0f, 0.000004f, // Group 0
+        5.0f,                      // Group 1
+        0.0f, 1.0f, 2.0f, 3.0f,    // Group 2
+        -1000.0f, -3.14f, -0.0002f,// Group 3 (neg only)
+        25.25f, 25.25f, 25.25f,    // Group 4
+        0.0f, 0.0f, 1.0f,          // Group 5
+        -0.000001f, 0.000001f      // Group 6
+    };
+    std::vector<uint_fast64_t> grouping = {
+        0, 3, 4, 8, 11, 14, 17, 19
+    };
+
+    std::vector<float> result_minimize = {
+        -1000.0f, // Group 0
+        5.0f,
+        0.0f,
+        -1000.0f,
+        25.25f,
+        0.0f,
+        -0.000001f
+    };
+    std::vector<float> result_maximize = {
+        0.000004f,
+        5.0f,
+        3.0f,
+        -0.0002f,
+        25.25f,
+        1.0f,
+        0.000001f
+    };
+
+    std::vector<float> result_cuda_minimize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+    std::vector<float> result_cuda_maximize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+
+    ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_minimize(groupedVector, grouping, result_cuda_minimize));
+    ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_maximize(groupedVector, grouping, result_cuda_maximize));
+
+    for (size_t i = 0; i < result_minimize.size(); ++i) {
+        ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i));
+        ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i));
+    }
+}
+
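+// The equalModuloPrecision kernels report the largest componentwise deviation instead
+// of a boolean: non-relative mode computes max_i |x_i - y_i|, relative mode
+// max_i |x_i - y_i| / |y_i|. For instance, with x2[i] = i and y3[i] = 1.0 the largest
+// relative deviation is |999 - 1| / 1 = 998, which the assertions below pin down.
+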
+TEST(CudaPlugin, equalModuloPrecision) {
+    std::vector<double> x = {
+        123.45, 67.8, 901.23, 456789.012, 3.456789, -4567890.12
+    };
+    std::vector<double> y1 = {
+        0.45, 0.8, 0.23, 0.012, 0.456789, -0.12
+    };
+    std::vector<double> y2 = {
+        0.45, 0.8, 0.23, 456789.012, 0.456789, -4567890.12
+    };
+    std::vector<double> x2;
+    std::vector<double> x3;
+    std::vector<double> y3;
+    std::vector<double> y4;
+    x2.reserve(1000);
+    x3.reserve(1000);
+    y3.reserve(1000);
+    y4.reserve(1000);
+    for (size_t i = 0; i < 1000; ++i) {
+        x2.push_back(static_cast<double>(i));
+        y3.push_back(1.0);
+        x3.push_back(-(1000.0 - static_cast<double>(i)));
+        y4.push_back(1.0);
+    }
+
+    double maxElement1 = 0.0;
+    double maxElement2 = 0.0;
+    double maxElement3 = 0.0;
+    double maxElement4 = 0.0;
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y1, maxElement1));
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y2, maxElement2));
+
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x2, y3, maxElement3));
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x3, y4, maxElement4));
+
+    ASSERT_DOUBLE_EQ(4567890.0, maxElement1);
+    ASSERT_DOUBLE_EQ(901.0, maxElement2);
+
+    ASSERT_DOUBLE_EQ(998.0, maxElement3);
+    ASSERT_DOUBLE_EQ(1001.0, maxElement4);
+}
+
+TEST(CudaPlugin, equalModuloPrecision_float) {
+    std::vector<float> x = {
+        123.45f, 67.8f, 901.23f, 456789.012f, 3.456789f, -4567890.12f
+    };
+    std::vector<float> y1 = {
+        0.45f, 0.8f, 0.23f, 0.012f, 0.456789f, -0.12f
+    };
+    std::vector<float> y2 = {
+        0.45f, 0.8f, 0.23f, 456789.012f, 0.456789f, -4567890.12f
+    };
+    std::vector<float> x2;
+    std::vector<float> x3;
+    std::vector<float> y3;
+    std::vector<float> y4;
+    x2.reserve(1000);
+    x3.reserve(1000);
+    y3.reserve(1000);
+    y4.reserve(1000);
+    for (size_t i = 0; i < 1000; ++i) {
+        x2.push_back(static_cast<float>(i));
+        y3.push_back(1.0f);
+        x3.push_back(-(1000.0f - static_cast<float>(i)));
+        y4.push_back(1.0f);
+    }
+
+    float maxElement1 = 0.0f;
+    float maxElement2 = 0.0f;
+    float maxElement3 = 0.0f;
+    float maxElement4 = 0.0f;
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y1, maxElement1));
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y2, maxElement2));
+
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x2, y3, maxElement3));
+    ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x3, y4, maxElement4));
+
+    ASSERT_FLOAT_EQ(4567890.0f, maxElement1);
+    ASSERT_FLOAT_EQ(901.0f, maxElement2);
+
+    ASSERT_FLOAT_EQ(998.0f, maxElement3);
+    ASSERT_FLOAT_EQ(1001.0f, maxElement4);
+}
+
+#endif
\ No newline at end of file
diff --git a/test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp b/test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
new file mode 100644
index 000000000..441c4a3bb
--- /dev/null
+++ b/test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
@@ -0,0 +1,163 @@
+#include "gtest/gtest.h"
+#include "storm-config.h"
+
+#include "src/settings/Settings.h"
+#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h"
+#include "src/solver/NativeNondeterministicLinearEquationSolver.h"
+#include "src/parser/AutoParser.h"
+
+TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) {
+    storm::settings::Settings* s = storm::settings::Settings::getInstance();
+    std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.trans.rew")->as<storm::models::Mdp<double>>();
+
+    ASSERT_EQ(mdp->getNumberOfStates(), 2095783ull);
+    ASSERT_EQ(mdp->getNumberOfTransitions(), 7714385ull);
+
+    storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp);
+
+    storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("elected");
+    storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true);
+
+    std::vector<double> result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("elected");
+    eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("elected");
+    storm::property::prctl::BoundedEventually<double>* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, true);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("elected");
+    boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, false);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[0] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("elected");
+    storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true);
+
+    result = mc.checkNoBoundOperator(*rewardFormula);
+
+    ASSERT_LT(std::abs(result[0] - 6.172433512), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete rewardFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("elected");
+    reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false);
+
+    result = mc.checkNoBoundOperator(*rewardFormula);
+
+    ASSERT_LT(std::abs(result[0] - 6.1724344), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete rewardFormula;
+}
+
+TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Consensus) {
+    storm::settings::Settings* s = storm::settings::Settings::getInstance();
+    // The maximal number of iterations has to be increased for this model, because the
+    // solver does not converge within the default bound; this is configured in the test's main cpp unit.
+
+    std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.tra", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.lab", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.steps.state.rew", "")->as<storm::models::Mdp<double>>();
+
+    ASSERT_EQ(mdp->getNumberOfStates(), 63616ull);
+    ASSERT_EQ(mdp->getNumberOfTransitions(), 213472ull);
+
+    storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp);
+
+    storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("finished");
+    storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula);
+    storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true);
+
+    std::vector<double> result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[31168] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("finished");
+    storm::property::prctl::Ap<double>* apFormula2 = new storm::property::prctl::Ap<double>("all_coins_equal_0");
+    storm::property::prctl::And<double>* andFormula = new storm::property::prctl::And<double>(apFormula, apFormula2);
+    eventuallyFormula = new storm::property::prctl::Eventually<double>(andFormula);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[31168] - 0.4374282832), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("finished");
+    apFormula2 = new storm::property::prctl::Ap<double>("all_coins_equal_1");
+    andFormula = new storm::property::prctl::And<double>(apFormula, apFormula2);
+    eventuallyFormula = new storm::property::prctl::Eventually<double>(andFormula);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[31168] - 0.5293286369), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("finished");
+    apFormula2 = new storm::property::prctl::Ap<double>("agree");
+    storm::property::prctl::Not<double>* notFormula = new storm::property::prctl::Not<double>(apFormula2);
+    andFormula = new storm::property::prctl::And<double>(apFormula, notFormula);
+    eventuallyFormula = new storm::property::prctl::Eventually<double>(andFormula);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[31168] - 0.10414097), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("finished");
+    storm::property::prctl::BoundedEventually<double>* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 50ull);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, true);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[31168] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("finished");
+    boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 50ull);
+    probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, false);
+
+    result = mc.checkNoBoundOperator(*probFormula);
+
+    ASSERT_LT(std::abs(result[31168] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete probFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("finished");
+    storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true);
+
+    result = mc.checkNoBoundOperator(*rewardFormula);
+
+    ASSERT_LT(std::abs(result[31168] - 1725.593313), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete rewardFormula;
+
+    apFormula = new storm::property::prctl::Ap<double>("finished");
+    reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula);
+    rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false);
+
+    result = mc.checkNoBoundOperator(*rewardFormula);
+
+    ASSERT_LT(std::abs(result[31168] - 2183.142422), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble());
+    delete rewardFormula;
+}
\ No newline at end of file