Merge branch 'philippTopologicalRevival' into cuda_integration
Conflicts:
src/storage/Decomposition.h
src/storage/SparseMatrix.cpp
src/storage/SparseMatrix.h
src/storage/StronglyConnectedComponentDecomposition.cpp
src/storage/StronglyConnectedComponentDecomposition.h
src/storm.cpp
test/functional/storage/StronglyConnectedComponentDecompositionTest.cpp
Former-commit-id: 27e660a295
41 changed files with 4329 additions and 110 deletions
  3  .gitmodules
 25  CMakeLists.txt
  2  examples/mdp/scc/scc.pctl
  1  resources/3rdparty/cusplibrary
 56  resources/cmake/FindCusp.cmake
 52  resources/cmake/FindThrust.cmake
 64  resources/cudaForStorm/CMakeAlignmentCheck.cpp
 31  resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
294  resources/cudaForStorm/CMakeLists.txt
  6  resources/cudaForStorm/srcCuda/allCudaKernels.h
  0  resources/cudaForStorm/srcCuda/bandWidth.cu
  0  resources/cudaForStorm/srcCuda/bandWidth.h
286  resources/cudaForStorm/srcCuda/basicAdd.cu
  9  resources/cudaForStorm/srcCuda/basicAdd.h
879  resources/cudaForStorm/srcCuda/basicValueIteration.cu
107  resources/cudaForStorm/srcCuda/basicValueIteration.h
 19  resources/cudaForStorm/srcCuda/cudaForStorm.h
 49  resources/cudaForStorm/srcCuda/cuspExtension.h
361  resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
375  resources/cudaForStorm/srcCuda/cuspExtensionFloat.h
 39  resources/cudaForStorm/srcCuda/kernelSwitchTest.cu
  1  resources/cudaForStorm/srcCuda/kernelSwitchTest.h
 33  resources/cudaForStorm/srcCuda/utility.cu
 12  resources/cudaForStorm/srcCuda/utility.h
 28  resources/cudaForStorm/srcCuda/version.cu
 16  resources/cudaForStorm/srcCuda/version.h
 21  resources/cudaForStorm/storm-cudaplugin-config.h.in
 10  src/counterexamples/PathBasedSubsystemGenerator.h
101  src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h
111  src/models/PseudoModel.cpp
 90  src/models/PseudoModel.h
  3  src/solver/NativeNondeterministicLinearEquationSolver.cpp
  2  src/solver/NativeNondeterministicLinearEquationSolver.h
467  src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
 97  src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
 10  src/utility/graph.h
 19  src/utility/vector.h
  3  storm-config.h.in
240  test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
354  test/functional/solver/CudaPluginTest.cpp
163  test/performance/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
@@ -0,0 +1,3 @@
[submodule "resources/3rdparty/cusplibrary"]
	path = resources/3rdparty/cusplibrary
	url = https://github.com/cusplibrary/cusplibrary.git
@@ -0,0 +1,2 @@
Pmin=? [ (!statetwo) U end ]
Pmax=? [ (!statetwo) U end ]
@@ -0,0 +1 @@
Subproject commit d8d7d9e97add8db08ef0ad5c0a7e9929fd83ce3c
@@ -0,0 +1,56 @@
#
# FindCusp
#
# This module finds the CUSP header files and extracts their version. It
# sets the following variables.
#
# CUSP_INCLUDE_DIR - Include directory for cusp header files. (All header
# files will actually be in the cusp subdirectory.)
# CUSP_VERSION - Version of cusp in the form "major.minor.patch".
#
# CUSP_FOUND - Indicates whether Cusp has been found
#

find_path(CUSP_INCLUDE_DIR
  HINTS
    /usr/include/cusp
    /usr/local/include
    /usr/local/cusp/include
    ${CUSP_INCLUDE_DIRS}
    ${CUSP_HINT}
  NAMES cusp/version.h
  DOC "Cusp headers"
  )
if(CUSP_INCLUDE_DIR)
  list(REMOVE_DUPLICATES CUSP_INCLUDE_DIR)

  # Find cusp version
  file(STRINGS ${CUSP_INCLUDE_DIR}/cusp/version.h
    version
    REGEX "#define CUSP_VERSION[ \t]+([0-9x]+)"
    )
  string(REGEX REPLACE
    "#define CUSP_VERSION[ \t]+"
    ""
    version
    "${version}"
    )

  #define CUSP_MAJOR_VERSION (CUSP_VERSION / 100000)
  #define CUSP_MINOR_VERSION (CUSP_VERSION / 100 % 1000)
  #define CUSP_SUBMINOR_VERSION (CUSP_VERSION % 100)

  math(EXPR CUSP_MAJOR_VERSION "${version} / 100000")
  math(EXPR CUSP_MINOR_VERSION "${version} / 100 % 1000")
  math(EXPR CUSP_PATCH_VERSION "${version} % 100")

  set(CUSP_VERSION "${CUSP_MAJOR_VERSION}.${CUSP_MINOR_VERSION}.${CUSP_PATCH_VERSION}")

  # Check for required components
  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(Cusp REQUIRED_VARS CUSP_INCLUDE_DIR VERSION_VAR CUSP_VERSION)

  set(CUSP_INCLUDE_DIRS ${CUSP_INCLUDE_DIR})
  mark_as_advanced(CUSP_INCLUDE_DIR)

endif(CUSP_INCLUDE_DIR)
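The three math(EXPR ...) lines above decode the single integer CUSP_VERSION found in cusp/version.h into a dotted version string. The following is a minimal sketch of the same decoding in C++ (purely illustrative, not part of this commit; the sample value 400 is an assumption):

#include <cstdio>

int main() {
    // Decode a CUSP_VERSION-style integer, mirroring the math(EXPR ...) calls in FindCusp.cmake.
    const int cuspVersion = 400;                 // assumed sample value
    const int major = cuspVersion / 100000;      // -> 0
    const int minor = cuspVersion / 100 % 1000;  // -> 4
    const int patch = cuspVersion % 100;         // -> 0
    std::printf("%d.%d.%d\n", major, minor, patch);  // prints "0.4.0"
    return 0;
}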
@@ -0,0 +1,52 @@
#
# FindThrust
#
# This module finds the Thrust header files and extracts their version. It
# sets the following variables.
#
# THRUST_INCLUDE_DIR - Include directory for thrust header files. (All header
# files will actually be in the thrust subdirectory.)
# THRUST_VERSION - Version of thrust in the form "major.minor.patch".
#
# Thrust_FOUND - Indicates whether Thrust has been found
#

find_path(THRUST_INCLUDE_DIR
  HINTS
    /usr/include/cuda
    /usr/local/include
    /usr/local/cuda/include
    ${CUDA_INCLUDE_DIRS}
  NAMES thrust/version.h
  DOC "Thrust headers"
  )
if(THRUST_INCLUDE_DIR)
  list(REMOVE_DUPLICATES THRUST_INCLUDE_DIR)
endif(THRUST_INCLUDE_DIR)

# Find thrust version
file(STRINGS ${THRUST_INCLUDE_DIR}/thrust/version.h
  version
  REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)"
  )
string(REGEX REPLACE
  "#define THRUST_VERSION[ \t]+"
  ""
  version
  "${version}"
  )

string(REGEX MATCH "^[0-9]" major ${version})
string(REGEX REPLACE "^${major}00" "" version "${version}")
string(REGEX MATCH "^[0-9]" minor ${version})
string(REGEX REPLACE "^${minor}0" "" version "${version}")
set(THRUST_VERSION "${major}.${minor}.${version}")
set(THRUST_MAJOR_VERSION "${major}")
set(THRUST_MINOR_VERSION "${minor}")

# Check for required components
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Thrust REQUIRED_VARS THRUST_INCLUDE_DIR VERSION_VAR THRUST_VERSION)

set(THRUST_INCLUDE_DIRS ${THRUST_INCLUDE_DIR})
mark_as_advanced(THRUST_INCLUDE_DIR)
@@ -0,0 +1,64 @@
/*
 * This is a component of StoRM - Cuda Plugin to check whether type alignment matches the assumptions done while optimizing the code.
 */
#include <cstdint>
#include <utility>
#include <vector>

#define CONTAINER_SIZE 100ul

template <typename IndexType, typename ValueType>
int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
	std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
	for (size_t i = 0; i < containerSize; ++i) {
		myVector->push_back(std::make_pair(firstValue, secondValue));
	}
	size_t myVectorSize = myVector->size();
	IndexType* firstStart = &(myVector->at(0).first);
	IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
	ValueType* secondStart = &(myVector->at(0).second);
	ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
	size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
	size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
	size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
	size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);

	delete myVector;
	myVector = nullptr;

	if (myVectorSize != containerSize) {
		return -2;
	}

	// Check for alignment:
	// Requirement is that the pairs are aligned like: first, second, first, second, first, second, ...
	if (sizeof(IndexType) != sizeof(ValueType)) {
		return -3;
	}
	if (startOffset != sizeof(IndexType)) {
		return -4;
	}
	if (endOffset != sizeof(IndexType)) {
		return -5;
	}
	if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
		return -6;
	}
	if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
		return -7;
	}

	return 0;
}


int main(int argc, char* argv[]) {
	int result = 0;

	result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
	if (result != 0) {
		return result;
	}

	return 0;
}
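The runtime check above verifies that std::pair<IndexType, ValueType> elements are laid out back to back (first, second, first, second, ...) without padding, which is what later allows a vector of such pairs to be handed to the GPU as one interleaved index/value buffer. A compile-time spot-check of the same assumption could look like the following sketch (illustrative only, not part of this commit; it uses the uint_fast64_t/double instantiation that main() above exercises):

#include <cstdint>
#include <utility>

// If the pair is packed, its size equals the sum of its members' sizes -
// the same property the runtime offset checks above verify element by element.
static_assert(sizeof(std::pair<uint_fast64_t, double>) ==
              sizeof(uint_fast64_t) + sizeof(double),
              "std::pair<uint_fast64_t, double> must not contain padding");

int main() { return 0; }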
@@ -0,0 +1,31 @@
/*
 * This is a component of StoRM - Cuda Plugin to check whether a pair of uint_fast64_t and float gets auto-aligned to match 64bit boundaries
 */
#include <cstdint>
#include <utility>
#include <vector>

#define CONTAINER_SIZE 100ul

int main(int argc, char* argv[]) {
	int result = 0;

	std::vector<std::pair<uint_fast64_t, float>> myVector;
	for (size_t i = 0; i < CONTAINER_SIZE; ++i) {
		myVector.push_back(std::make_pair(i, 42.12345f * i));
	}

	char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first));
	char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first));
	ptrdiff_t uintDiff = secondUintPointer - firstUintPointer;

	if (uintDiff == (2 * sizeof(uint_fast64_t))) {
		result = 2;
	} else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) {
		result = 3;
	} else {
		result = -5;
	}

	return result;
}
@@ -0,0 +1,294 @@
|||
cmake_minimum_required (VERSION 2.8.6) |
|||
|
|||
# Set project name |
|||
project (cudaForStorm CXX C) |
|||
|
|||
# Set the version number |
|||
set (STORM_CPP_VERSION_MAJOR 1) |
|||
set (STORM_CPP_VERSION_MINOR 0) |
|||
|
|||
# Add base folder for better inclusion paths |
|||
include_directories("${PROJECT_SOURCE_DIR}") |
|||
include_directories("${PROJECT_SOURCE_DIR}/src") |
|||
|
|||
message(STATUS "StoRM (CudaPlugin) - CUDA_PATH is ${CUDA_PATH} or $ENV{CUDA_PATH}") |
|||
|
|||
############################################################# |
|||
## |
|||
## CMake options of StoRM |
|||
## |
|||
############################################################# |
|||
option(CUDAFORSTORM_DEBUG "Sets whether the DEBUG mode is used" ON) |
|||
option(LINK_LIBCXXABI "Sets whether libc++abi should be linked." OFF) |
|||
option(USE_LIBCXX "Sets whether the standard library is libc++." OFF) |
|||
|
|||
set(ADDITIONAL_INCLUDE_DIRS "" CACHE STRING "Additional directories added to the include directories.") |
|||
set(ADDITIONAL_LINK_DIRS "" CACHE STRING "Additional directories added to the link directories.") |
|||
set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE STRING "The Build directory of storm, where the library files should be installed to (if available).") |
|||
|
|||
############################################################# |
|||
## |
|||
## Inclusion of required libraries |
|||
## |
|||
############################################################# |
|||
|
|||
# Add the resources/cmake folder to Module Search Path for FindTBB.cmake |
|||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/") |
|||
|
|||
# Set the hint for CUSP |
|||
set(CUSP_HINT "${PROJECT_SOURCE_DIR}/../3rdparty/cusplibrary") |
|||
|
|||
find_package(CUDA REQUIRED) |
|||
find_package(Cusp REQUIRED) |
|||
find_package(Doxygen REQUIRED) |
|||
find_package(Thrust REQUIRED) |
|||
|
|||
# If the DEBUG option was turned on, we will target a debug version and a release version otherwise |
|||
if (CUDAFORSTORM_DEBUG) |
|||
set (CMAKE_BUILD_TYPE "DEBUG") |
|||
else() |
|||
set (CMAKE_BUILD_TYPE "RELEASE") |
|||
endif() |
|||
message(STATUS "StoRM (CudaPlugin) - Building ${CMAKE_BUILD_TYPE} version.") |
|||
|
|||
message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") |
|||
message(STATUS "StoRM (CudaPlugin) - CMAKE_BUILD_TYPE (ENV): $ENV{CMAKE_BUILD_TYPE}") |
|||
|
|||
############################################################# |
|||
## |
|||
## CUDA Options |
|||
## |
|||
############################################################# |
|||
SET (CUDA_VERBOSE_BUILD ON CACHE BOOL "nvcc verbose" FORCE) |
|||
set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON) |
|||
set(BUILD_SHARED_LIBS OFF) |
|||
set(CUDA_SEPARABLE_COMPILATION ON) |
|||
#set(CUDA_NVCC_FLAGS "-arch=sm_30") |
|||
|
|||
# Because the FindCUDA.cmake file has a path related bug, two folders have to be present |
|||
file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Debug") |
|||
file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/cudaForStorm.dir/Release") |
|||
|
|||
|
|||
############################################################# |
|||
## |
|||
## Compiler specific settings and definitions |
|||
## |
|||
############################################################# |
|||
if(CMAKE_COMPILER_IS_GNUCC) |
|||
message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: GCC") |
|||
# Set standard flags for GCC |
|||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops") |
|||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -pedantic") |
|||
elseif(MSVC) |
|||
message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: MSVC") |
|||
# required for GMM to compile, ugly error directive in their code |
|||
add_definitions(/D_SCL_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS) |
|||
# required as the PRCTL Parser bloats object files (COFF) beyond their maximum size (see http://msdn.microsoft.com/en-us/library/8578y171(v=vs.110).aspx) |
|||
add_definitions(/bigobj) |
|||
# required by GTest and PrismGrammar::createIntegerVariable |
|||
add_definitions(/D_VARIADIC_MAX=10) |
|||
# Windows.h breaks GMM in gmm_except.h because of its macro definition for min and max |
|||
add_definitions(/DNOMINMAX) |
|||
else(CLANG) |
|||
message(STATUS "StoRM (CudaPlugin) - Using Compiler Configuration: Clang (LLVM)") |
|||
# As CLANG is not set as a variable, we need to set it in case we have not matched another compiler. |
|||
set (CLANG ON) |
|||
# Set standard flags for clang |
|||
set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -funroll-loops -O3") |
|||
if(UNIX AND NOT APPLE AND NOT USE_LIBCXX) |
|||
set(CLANG_STDLIB libstdc++) |
|||
message(STATUS "StoRM (CudaPlugin) - Linking against libstdc++") |
|||
else() |
|||
set(CLANG_STDLIB libc++) |
|||
message(STATUS "StoRM (CudaPlugin) - Linking against libc++") |
|||
# Set up some Xcode specific settings |
|||
set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11") |
|||
set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") |
|||
endif() |
|||
|
|||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=${CLANG_STDLIB} -Wall -pedantic -Wno-unused-variable -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -ftemplate-depth=1024") |
|||
|
|||
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g") |
|||
endif() |
|||
|
|||
############################################################# |
|||
## |
|||
## CMake-generated Config File for StoRM |
|||
## |
|||
############################################################# |
|||
|
|||
# Test for type alignment |
|||
try_run(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT |
|||
${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp" |
|||
COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR |
|||
) |
|||
if(NOT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT) |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}") |
|||
elseif(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT EQUAL 0) |
|||
message(STATUS "StoRM (CudaPlugin) - Result of Type Alignment Check: OK.") |
|||
else() |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})") |
|||
endif() |
|||
|
|||
# Test for Float 64bit Alignment |
|||
try_run(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT |
|||
${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp" |
|||
COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR |
|||
) |
|||
if(NOT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT) |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test float type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}") |
|||
elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 2) |
|||
message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment active.") |
|||
set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "define") |
|||
elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 3) |
|||
message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment disabled.") |
|||
set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "undef") |
|||
else() |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Float Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_FLOATALIGNMENT})") |
|||
endif() |
|||
|
|||
|
|||
# |
|||
# Make a version file containing the current version from git. |
|||
# |
|||
include(GetGitRevisionDescription) |
|||
git_describe_checkout(STORM_GIT_VERSION_STRING) |
|||
# Parse the git Tag into variables |
|||
string(REGEX REPLACE "^([0-9]+)\\..*" "\\1" STORM_CUDAPLUGIN_VERSION_MAJOR "${STORM_GIT_VERSION_STRING}") |
|||
string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_MINOR "${STORM_GIT_VERSION_STRING}") |
|||
string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_PATCH "${STORM_GIT_VERSION_STRING}") |
|||
string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-([0-9]+)\\-.*" "\\1" STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD "${STORM_GIT_VERSION_STRING}") |
|||
string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-([a-z0-9]+).*" "\\1" STORM_CUDAPLUGIN_VERSION_HASH "${STORM_GIT_VERSION_STRING}") |
|||
string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.[0-9]+\\-[0-9]+\\-[a-z0-9]+\\-(.*)" "\\1" STORM_CUDAPLUGIN_VERSION_APPENDIX "${STORM_GIT_VERSION_STRING}") |
|||
if ("${STORM_CUDAPLUGIN_VERSION_APPENDIX}" MATCHES "^.*dirty.*$") |
|||
set(STORM_CUDAPLUGIN_VERSION_DIRTY 1) |
|||
else() |
|||
set(STORM_CUDAPLUGIN_VERSION_DIRTY 0) |
|||
endif() |
|||
message(STATUS "StoRM (CudaPlugin) - Version information: ${STORM_CUDAPLUGIN_VERSION_MAJOR}.${STORM_CUDAPLUGIN_VERSION_MINOR}.${STORM_CUDAPLUGIN_VERSION_PATCH} (${STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD} commits ahead of Tag) build from ${STORM_CUDAPLUGIN_VERSION_HASH} (Dirty: ${STORM_CUDAPLUGIN_VERSION_DIRTY})") |
|||
|
|||
|
|||
# Configure a header file to pass some of the CMake settings to the source code |
|||
configure_file ( |
|||
"${PROJECT_SOURCE_DIR}/storm-cudaplugin-config.h.in" |
|||
"${PROJECT_BINARY_DIR}/include/storm-cudaplugin-config.h" |
|||
) |
|||
# Add the binary dir include directory for storm-config.h |
|||
include_directories("${PROJECT_BINARY_DIR}/include") |
|||
|
|||
# Add the main source directory for includes |
|||
include_directories("${PROJECT_SOURCE_DIR}/../../src") |
|||
|
|||
############################################################# |
|||
## |
|||
## Source file aggregation and clustering |
|||
## |
|||
############################################################# |
|||
file(GLOB_RECURSE CUDAFORSTORM_HEADERS ${PROJECT_SOURCE_DIR}/src/*.h) |
|||
file(GLOB_RECURSE CUDAFORSTORM_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp) |
|||
|
|||
file(GLOB_RECURSE CUDAFORSTORM_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/srcCuda/*.cu") |
|||
file(GLOB_RECURSE CUDAFORSTORM_CUDA_HEADERS "${PROJECT_SOURCE_DIR}/srcCuda/*.h") |
|||
|
|||
# Additional include files like the storm-config.h |
|||
file(GLOB_RECURSE CUDAFORSTORM_BUILD_HEADERS ${PROJECT_BINARY_DIR}/include/*.h) |
|||
|
|||
# Group the headers and sources |
|||
source_group(main FILES ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES}) |
|||
source_group(cuda FILES ${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}) |
|||
|
|||
# Add custom additional include or link directories |
|||
if (ADDITIONAL_INCLUDE_DIRS) |
|||
message(STATUS "StoRM (CudaPlugin) - Using additional include directories ${ADDITIONAL_INCLUDE_DIRS}") |
|||
include_directories(${ADDITIONAL_INCLUDE_DIRS}) |
|||
endif(ADDITIONAL_INCLUDE_DIRS) |
|||
if (ADDITIONAL_LINK_DIRS) |
|||
message(STATUS "StoRM (CudaPlugin) - Using additional link directories ${ADDITIONAL_LINK_DIRS}") |
|||
link_directories(${ADDITIONAL_LINK_DIRS}) |
|||
endif(ADDITIONAL_LINK_DIRS) |
|||
|
|||
############################################################# |
|||
## |
|||
## Pre executable-creation link_directories setup |
|||
## |
|||
############################################################# |
|||
|
|||
|
|||
|
|||
############################################################# |
|||
## |
|||
## CUDA |
|||
## |
|||
############################################################# |
|||
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30) |
|||
#target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY}) |
|||
#ADD_DEPENDENCIES(cudaForStorm cudaLibrary) |
|||
#target_link_libraries(cudaForStorm cudaLibrary) |
|||
message(STATUS "StoRM (CudaPlugin) - Found CUDA SDK in Version ${CUDA_VERSION_STRING}, sparse lib is ${CUDA_cusparse_LIBRARY}") |
|||
include_directories(${CUDA_INCLUDE_DIRS}) |
|||
|
|||
############################################################# |
|||
## |
|||
## CUSP |
|||
## |
|||
############################################################# |
|||
if(CUSP_FOUND) |
|||
include_directories(${CUSP_INCLUDE_DIR}) |
|||
cuda_include_directories(${CUSP_INCLUDE_DIR}) |
|||
message(STATUS "StoRM (CudaPlugin) - Found CUSP Version ${CUSP_VERSION} in location ${CUSP_INCLUDE_DIR}") |
|||
else() |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find CUSP!") |
|||
endif() |
|||
|
|||
############################################################# |
|||
## |
|||
## Thrust |
|||
## |
|||
############################################################# |
|||
if(THRUST_FOUND) |
|||
include_directories(${THRUST_INCLUDE_DIR}) |
|||
cuda_include_directories(${THRUST_INCLUDE_DIR}) |
|||
message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION} in location ${THRUST_INCLUDE_DIR}") |
|||
else() |
|||
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find Thrust! Check your CUDA installation.") |
|||
endif() |
|||
|
|||
############################################################################### |
|||
## # |
|||
## Executable Creation # |
|||
## # |
|||
## All link_directories() calls AND include_directories() calls # |
|||
## MUST be made before this point # |
|||
## # |
|||
############################################################################### |
|||
include (GenerateExportHeader) |
|||
|
|||
cuda_add_library(cudaForStorm SHARED |
|||
${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS} |
|||
OPTIONS -DSTUFF="" -arch=sm_30 |
|||
RELEASE -DNDEBUG |
|||
DEBUG -g -DDEBUG |
|||
) |
|||
GENERATE_EXPORT_HEADER( cudaForStorm |
|||
BASE_NAME cudaForStorm |
|||
EXPORT_MACRO_NAME cudaForStorm_EXPORT |
|||
EXPORT_FILE_NAME include/cudaForStorm_Export.h |
|||
STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC |
|||
) |
|||
|
|||
if (MSVC) |
|||
# Add the DebugHelper DLL |
|||
set(CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES} Dbghelp.lib") |
|||
target_link_libraries(cudaForStorm "Dbghelp.lib") |
|||
endif(MSVC) |
|||
|
|||
# Link against libc++abi if requested. May be needed to build on Linux systems using clang. |
|||
if (LINK_LIBCXXABI) |
|||
message (STATUS "StoRM (CudaPlugin) - Linking against libc++abi.") |
|||
target_link_libraries(cudaForStorm "c++abi") |
|||
endif(LINK_LIBCXXABI) |
|||
|
|||
# Install Directive |
|||
install(TARGETS cudaForStorm DESTINATION "${STORM_LIB_INSTALL_DIR}/lib") |
|||
install(FILES "${PROJECT_SOURCE_DIR}/srcCuda/cudaForStorm.h" "${PROJECT_BINARY_DIR}/include/cudaForStorm_Export.h" DESTINATION "${STORM_LIB_INSTALL_DIR}/include") |
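The string(REGEX REPLACE ...) block above expects STORM_GIT_VERSION_STRING in the git-describe form "<major>.<minor>.<patch>-<commits ahead>-<hash>[-<appendix>]", where a "dirty" appendix marks uncommitted changes. The following C++ sketch parses a made-up string of that shape with one equivalent regular expression (illustrative only, not part of this commit):

#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::string gitVersion = "1.0.2-15-gabc1234-dirty";  // assumed sample value
    const std::regex pattern("^([0-9]+)\\.([0-9]+)\\.([0-9]+)-([0-9]+)-([a-z0-9]+)(?:-(.*))?$");
    std::smatch m;
    if (std::regex_match(gitVersion, m, pattern)) {
        const bool dirty = m[6].matched && m[6].str().find("dirty") != std::string::npos;
        std::cout << "major=" << m[1] << " minor=" << m[2] << " patch=" << m[3]
                  << " ahead=" << m[4] << " hash=" << m[5] << " dirty=" << dirty << std::endl;
    }
    return 0;
}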
@@ -0,0 +1,6 @@
#include "utility.h"
#include "bandWidth.h"
#include "basicAdd.h"
#include "kernelSwitchTest.h"
#include "basicValueIteration.h"
#include "version.h"
@@ -0,0 +1,286 @@
|||
#include <cuda.h> |
|||
#include <stdlib.h> |
|||
#include <stdio.h> |
|||
|
|||
#include <chrono> |
|||
#include <iostream> |
|||
|
|||
__global__ void cuda_kernel_basicAdd(int a, int b, int *c) { |
|||
*c = a + b; |
|||
} |
|||
|
|||
__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) { |
|||
// Fused Multiply Add: |
|||
// A * B + C => D |
|||
|
|||
/*
 * The variable i is used to index into the array. Since every thread executes the function
 * VecAdd, i has to be different for every thread; otherwise different threads would write to
 * the same array index. blockDim.x is the number of threads in the x dimension of the block,
 * blockIdx.x is the x coordinate of the current block, and threadIdx.x is the x coordinate of
 * the thread that is currently executing the function.
 */
|||
int i = blockDim.x * blockIdx.x + threadIdx.x; |
|||
|
|||
if (i < N) { |
|||
D[i] = A[i] * B[i] + C[i]; |
|||
} |
|||
} |
|||
|
|||
__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) { |
|||
// Fused Multiply Add: |
|||
// A * B + C => D |
|||
|
|||
// Layout: |
|||
// A B C D A B C D A B C D |
|||
|
|||
int i = blockDim.x * blockIdx.x + threadIdx.x; |
|||
|
|||
if ((i*M) < N) { |
|||
for (int j = i*M; j < i*M + M; ++j) { |
|||
A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2]; |
|||
} |
|||
} |
|||
} |
|||
|
|||
extern "C" int cuda_basicAdd(int a, int b) { |
|||
int c = 0; |
|||
int *dev_c; |
|||
cudaMalloc((void**)&dev_c, sizeof(int)); |
|||
cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c); |
|||
cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost); |
|||
//printf("%d + %d + 42 is %d\n", a, b, c); |
|||
cudaFree(dev_c); |
|||
return c; |
|||
} |
|||
|
|||
void cpp_cuda_bandwidthTest(int entryCount, int N) { |
|||
// Size of the Arrays |
|||
size_t arraySize = entryCount * sizeof(int); |
|||
|
|||
int* deviceIntArray; |
|||
int* hostIntArray = new int[arraySize]; |
|||
|
|||
// Allocate space on the device |
|||
auto start_time = std::chrono::high_resolution_clock::now(); |
|||
for (int i = 0; i < N; ++i) { |
|||
if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) { |
|||
std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl; |
|||
delete[] hostIntArray; |
|||
return; |
|||
} |
|||
// Free memory on device |
|||
if (cudaFree(deviceIntArray) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
delete[] hostIntArray; |
|||
return; |
|||
} |
|||
} |
|||
auto end_time = std::chrono::high_resolution_clock::now(); |
|||
auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count(); |
|||
double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625; |
|||
std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl; |
|||
std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second Allocationspeed." << std::endl; |
|||
|
|||
if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) { |
|||
std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl; |
|||
delete[] hostIntArray; |
|||
return; |
|||
} |
|||
|
|||
// Prepare data |
|||
for (int i = 0; i < N; ++i) { |
|||
hostIntArray[i] = i * 333 + 123; |
|||
} |
|||
|
|||
// Copy data TO device |
|||
start_time = std::chrono::high_resolution_clock::now(); |
|||
for (int i = 0; i < N; ++i) { |
|||
if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { |
|||
std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl; |
|||
// Free memory on device |
|||
if (cudaFree(deviceIntArray) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
} |
|||
delete[] hostIntArray; |
|||
return; |
|||
} |
|||
} |
|||
end_time = std::chrono::high_resolution_clock::now(); |
|||
copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count(); |
|||
mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625; |
|||
std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl; |
|||
std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl; |
|||
|
|||
// Copy data FROM device |
|||
start_time = std::chrono::high_resolution_clock::now(); |
|||
for (int i = 0; i < N; ++i) { |
|||
if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) { |
|||
std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl; |
|||
// Free memory on device |
|||
if (cudaFree(deviceIntArray) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
} |
|||
delete[] hostIntArray; |
|||
return; |
|||
} |
|||
} |
|||
end_time = std::chrono::high_resolution_clock::now(); |
|||
copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count(); |
|||
mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625; |
|||
std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl; |
|||
std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl; |
|||
|
|||
// Free memory on device |
|||
if (cudaFree(deviceIntArray) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
} |
|||
delete[] hostIntArray; |
|||
} |
|||
|
|||
extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) { |
|||
// Size of the Arrays |
|||
size_t arraySize = N * sizeof(int); |
|||
|
|||
int* deviceIntArrayA; |
|||
int* deviceIntArrayB; |
|||
int* deviceIntArrayC; |
|||
int* deviceIntArrayD; |
|||
|
|||
// Allocate space on the device |
|||
if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) { |
|||
printf("Error in cudaMalloc1!\n"); |
|||
return; |
|||
} |
|||
if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) { |
|||
printf("Error in cudaMalloc2!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
return; |
|||
} |
|||
if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) { |
|||
printf("Error in cudaMalloc3!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
return; |
|||
} |
|||
if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) { |
|||
printf("Error in cudaMalloc4!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
return; |
|||
} |
|||
|
|||
// Copy data TO device |
|||
if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
cudaFree(deviceIntArrayD); |
|||
return; |
|||
} |
|||
if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
cudaFree(deviceIntArrayD); |
|||
return; |
|||
} |
|||
if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
cudaFree(deviceIntArrayD); |
|||
return; |
|||
} |
|||
|
|||
// Set the number of threads per block
int threadsPerBlock = 512;
// Use as many blocks as are needed to process all elements of the vectors
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
|||
|
|||
// Run kernel |
|||
cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N); |
|||
|
|||
// Copy data FROM device |
|||
if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
cudaFree(deviceIntArrayD); |
|||
return; |
|||
} |
|||
|
|||
// Free memory on device |
|||
cudaFree(deviceIntArrayA); |
|||
cudaFree(deviceIntArrayB); |
|||
cudaFree(deviceIntArrayC); |
|||
cudaFree(deviceIntArrayD); |
|||
} |
|||
|
|||
extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) { |
|||
// Size of the Arrays |
|||
size_t arraySize = N * sizeof(int) * 4; |
|||
|
|||
int* deviceIntArrayA; |
|||
|
|||
// Allocate space on the device |
|||
if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) { |
|||
printf("Error in cudaMalloc1!\n"); |
|||
return; |
|||
} |
|||
|
|||
#define ONFAILFREE0() do { } while(0) |
|||
#define ONFAILFREE1(a) do { cudaFree(a); } while(0) |
|||
#define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0) |
|||
#define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0) |
|||
#define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0) |
|||
#define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { cudaError_t retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in cuda%s!\n", #func__); return; } } while(0)
|||
|
|||
// Copy data TO device |
|||
|
|||
CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice); |
|||
|
|||
/*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
return; |
|||
}*/ |
|||
|
|||
// Set the number of threads per block
int threadsPerBlock = 512;
// Use as many blocks as are needed to process all elements of the vectors
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
|||
|
|||
// Run kernel |
|||
cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M); |
|||
|
|||
// Copy data FROM device |
|||
if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) { |
|||
printf("Error in cudaMemcpy!\n"); |
|||
cudaFree(deviceIntArrayA); |
|||
return; |
|||
} |
|||
|
|||
// Free memory on device |
|||
if (cudaFree(deviceIntArrayA) != cudaSuccess) { |
|||
printf("Error in cudaFree!\n"); |
|||
return; |
|||
} |
|||
} |
|||
|
|||
extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) { |
|||
for (int i = 0; i < N; ++i) { |
|||
D[i] = A[i] * B[i] + C[i]; |
|||
} |
|||
} |
|||
|
|||
extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) { |
|||
for (int i = 0; i < N; i += 4) { |
|||
A[i+3] = A[i] * A[i+1] + A[i+2]; |
|||
} |
|||
} |
@@ -0,0 +1,9 @@
extern "C" int cuda_basicAdd(int a, int b);

extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M);
extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N);

extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N);
extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N);

void cpp_cuda_bandwidthTest(int entryCount, int N);
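basicAdd.h exposes the toy kernels through a plain C interface, so a host-side caller needs no CUDA headers of its own. A minimal usage sketch (hypothetical, not part of this commit) that cross-checks the GPU fused multiply-add against the CPU helper declared above:

#include <cstdio>
#include "basicAdd.h"

int main() {
    const int N = 8;
    int A[N], B[N], C[N], D_gpu[N], D_cpu[N];
    for (int i = 0; i < N; ++i) { A[i] = i; B[i] = 2 * i; C[i] = 1; D_gpu[i] = 0; D_cpu[i] = 0; }

    cuda_arrayFma(A, B, C, D_gpu, N);        // GPU: D = A * B + C
    cuda_arrayFmaHelper(A, B, C, D_cpu, N);  // CPU reference

    for (int i = 0; i < N; ++i) {
        if (D_gpu[i] != D_cpu[i]) { std::printf("mismatch at index %d\n", i); return 1; }
    }
    std::printf("cuda_basicAdd(1, 2) returned %d\n", cuda_basicAdd(1, 2));
    return 0;
}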
@@ -0,0 +1,879 @@
|||
#include "basicValueIteration.h" |
|||
#define CUSP_USE_TEXTURE_MEMORY |
|||
|
|||
#include <iostream> |
|||
#include <chrono> |
|||
|
|||
#include <cuda_runtime.h> |
|||
#include "cusparse_v2.h" |
|||
|
|||
#include "utility.h" |
|||
|
|||
#include "cuspExtension.h" |
|||
|
|||
#include <thrust/transform.h> |
|||
#include <thrust/device_ptr.h> |
|||
#include <thrust/functional.h> |
|||
|
|||
#include "storm-cudaplugin-config.h" |
|||
|
|||
#ifdef DEBUG |
|||
#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false) |
|||
#else |
|||
#define CUDA_CHECK_ALL_ERRORS() do {} while (false) |
|||
#endif |
|||
|
|||
template<typename T, bool Relative> |
|||
struct equalModuloPrecision : public thrust::binary_function<T,T,T> |
|||
{ |
|||
__host__ __device__ T operator()(const T &x, const T &y) const |
|||
{ |
|||
if (Relative) { |
|||
if (y == 0) { |
|||
return ((x >= 0) ? (x) : (-x)); |
|||
} |
|||
const T result = (x - y) / y; |
|||
return ((result >= 0) ? (result) : (-result)); |
|||
} else { |
|||
const T result = (x - y); |
|||
return ((result >= 0) ? (result) : (-result)); |
|||
} |
|||
} |
|||
}; |
|||
|
|||
template<typename IndexType, typename ValueType> |
|||
void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) { |
|||
indexVector.reserve(inputVector.size()); |
|||
valueVector.reserve(inputVector.size()); |
|||
for (size_t i = 0; i < inputVector.size(); ++i) { |
|||
indexVector.push_back(inputVector.at(i).first); |
|||
valueVector.push_back(inputVector.at(i).second); |
|||
} |
|||
} |
|||
|
|||
// TEMPLATE VERSION |
|||
template <bool Minimize, bool Relative, typename IndexType, typename ValueType> |
|||
bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
//std::vector<IndexType> matrixColumnIndices; |
|||
//std::vector<ValueType> matrixValues; |
|||
//exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues); |
|||
bool errorOccured = false; |
|||
|
|||
IndexType* device_matrixRowIndices = nullptr; |
|||
ValueType* device_matrixColIndicesAndValues = nullptr; |
|||
ValueType* device_x = nullptr; |
|||
ValueType* device_xSwap = nullptr; |
|||
ValueType* device_b = nullptr; |
|||
ValueType* device_multiplyResult = nullptr; |
|||
IndexType* device_nondeterministicChoiceIndices = nullptr; |
|||
|
|||
#ifdef DEBUG |
|||
std::cout.sync_with_stdio(true); |
|||
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl; |
|||
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl; |
|||
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size(); |
|||
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; |
|||
#endif |
|||
|
|||
const IndexType matrixRowCount = matrixRowIndices.size() - 1; |
|||
const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1; |
|||
const IndexType matrixNnzCount = columnIndicesAndValues.size(); |
|||
|
|||
cudaError_t cudaMallocResult; |
|||
|
|||
bool converged = false; |
|||
iterationCount = 0; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1)); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true |
|||
#else |
|||
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false |
|||
#endif |
|||
if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) { |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
} else { |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_xSwap), sizeof(ValueType) * matrixColCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * matrixRowCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1)); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished allocating memory." << std::endl; |
|||
#endif |
|||
|
|||
// Memory allocated, copy data to device |
|||
cudaError_t cudaCopyResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Copy all data as floats are expanded to 64bits :/ |
|||
if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) { |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
} else { |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Preset the xSwap to zeros... |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Preset the multiplyResult to zeros... |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl; |
|||
#endif |
|||
|
|||
// Data is on device, start Kernel |
|||
while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization |
|||
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b); |
|||
thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult); |
|||
|
|||
// Transform: Add multiplyResult + b inplace to multiplyResult |
|||
thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
// Reduce: Reduce multiplyResult to a new x vector |
|||
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
// Check for convergence |
|||
// Transform: x = abs(x - xSwap)/ xSwap |
|||
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x); |
|||
thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount); |
|||
thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap); |
|||
thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
// Reduce: get Max over x and check for res < Precision |
|||
ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
converged = (maxX < precision); |
|||
++iterationCount; |
|||
|
|||
// Swap pointers, device_x always contains the most current result |
|||
std::swap(device_x, device_xSwap); |
|||
} |
|||
|
|||
if (!converged && (iterationCount == maxIterationCount)) { |
|||
iterationCount = 0; |
|||
errorOccured = true; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished kernel execution." << std::endl; |
|||
std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl; |
|||
#endif |
|||
|
|||
// Get x back from the device |
|||
cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl; |
|||
errorOccured = true; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished copying result data." << std::endl; |
|||
#endif |
|||
|
|||
// All code related to freeing memory and clearing up the device |
|||
cleanup: |
|||
if (device_matrixRowIndices != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_matrixRowIndices = nullptr; |
|||
} |
|||
if (device_matrixColIndicesAndValues != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_matrixColIndicesAndValues = nullptr; |
|||
} |
|||
if (device_x != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_x); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_x = nullptr; |
|||
} |
|||
if (device_xSwap != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_xSwap); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_xSwap = nullptr; |
|||
} |
|||
if (device_b != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_b); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_b = nullptr; |
|||
} |
|||
if (device_multiplyResult != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_multiplyResult); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_multiplyResult = nullptr; |
|||
} |
|||
if (device_nondeterministicChoiceIndices != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." << std::endl; |
|||
errorOccured = true; |
|||
} |
|||
device_nondeterministicChoiceIndices = nullptr; |
|||
} |
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished cleanup." << std::endl; |
|||
#endif |
|||
|
|||
return !errorOccured; |
|||
} |
|||
|
|||
template <typename IndexType, typename ValueType> |
|||
void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) { |
|||
IndexType* device_matrixRowIndices = nullptr; |
|||
ValueType* device_matrixColIndicesAndValues = nullptr; |
|||
ValueType* device_x = nullptr; |
|||
ValueType* device_multiplyResult = nullptr; |
|||
|
|||
#ifdef DEBUG |
|||
std::cout.sync_with_stdio(true); |
|||
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl; |
|||
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl; |
|||
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size(); |
|||
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; |
|||
#endif |
|||
|
|||
const IndexType matrixRowCount = matrixRowIndices.size() - 1; |
|||
const IndexType matrixNnzCount = columnIndicesAndValues.size(); |
|||
|
|||
cudaError_t cudaMallocResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1)); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
#else |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
#endif |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished allocating memory." << std::endl; |
|||
#endif |
|||
|
|||
// Memory allocated, copy data to device |
|||
cudaError_t cudaCopyResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
#else |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
#endif |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Preset the multiplyResult to zeros... |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl; |
|||
#endif |
|||
|
|||
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished kernel execution." << std::endl; |
|||
#endif |
|||
|
|||
// Get result back from the device |
|||
cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished copying result data." << std::endl; |
|||
#endif |
|||
|
|||
// All code related to freeing memory and clearing up the device |
|||
cleanup: |
|||
if (device_matrixRowIndices != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_matrixRowIndices = nullptr; |
|||
} |
|||
if (device_matrixColIndicesAndValues != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_matrixColIndicesAndValues = nullptr; |
|||
} |
|||
if (device_x != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_x); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_x = nullptr; |
|||
} |
|||
if (device_multiplyResult != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_multiplyResult); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_multiplyResult = nullptr; |
|||
} |
|||
#ifdef DEBUG |
|||
std::cout << "(DLL) Finished cleanup." << std::endl; |
|||
#endif |
|||
} |
|||
|
|||
template <typename ValueType> |
|||
void basicValueIteration_addVectorsInplace(std::vector<ValueType>& a, std::vector<ValueType> const& b) { |
|||
ValueType* device_a = nullptr; |
|||
ValueType* device_b = nullptr; |
|||
|
|||
const size_t vectorSize = std::max(a.size(), b.size()); |
|||
|
|||
cudaError_t cudaMallocResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_a), sizeof(ValueType) * vectorSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * vectorSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Memory allocated, copy data to device |
|||
cudaError_t cudaCopyResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
do { |
|||
// Transform: add b to a element-wise, storing the result in a
|||
thrust::device_ptr<ValueType> devicePtrThrust_a(device_a); |
|||
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b); |
|||
thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus<ValueType>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
} while (false); |
|||
|
|||
// Get result back from the device |
|||
cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// All code related to freeing memory and clearing up the device |
|||
cleanup: |
|||
if (device_a != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_a); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_a = nullptr; |
|||
} |
|||
if (device_b != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_b); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_b = nullptr; |
|||
} |
|||
} |
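// Illustrative usage of the in-place addition above (hypothetical host code,
// not part of the original source). After the call, a holds the element-wise sum:
//
//   std::vector<double> a = {1.0, 2.0, 3.0};
//   std::vector<double> b = {0.5, 0.5, 0.5};
//   basicValueIteration_addVectorsInplace_double(a, b); // a becomes {1.5, 2.5, 3.5}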
|||
|
|||
template <typename IndexType, typename ValueType, bool Minimize> |
|||
void basicValueIteration_reduceGroupedVector(std::vector<ValueType> const& groupedVector, std::vector<IndexType> const& grouping, std::vector<ValueType>& targetVector) { |
|||
ValueType* device_groupedVector = nullptr; |
|||
IndexType* device_grouping = nullptr; |
|||
ValueType* device_target = nullptr; |
|||
|
|||
const size_t groupedSize = groupedVector.size(); |
|||
const size_t groupingSize = grouping.size(); |
|||
const size_t targetSize = targetVector.size(); |
|||
|
|||
cudaError_t cudaMallocResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_groupedVector), sizeof(ValueType) * groupedSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_grouping), sizeof(IndexType) * groupingSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_target), sizeof(ValueType) * targetSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Memory allocated, copy data to device |
|||
cudaError_t cudaCopyResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
do { |
|||
// Reduce: reduce each row group of groupedVector to a single value in targetVector
|||
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
} while (false); |
|||
|
|||
// Get result back from the device |
|||
cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// All code related to freeing memory and clearing up the device |
|||
cleanup: |
|||
if (device_groupedVector != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_groupedVector); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_groupedVector = nullptr; |
|||
} |
|||
if (device_grouping != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_grouping); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_grouping = nullptr; |
|||
} |
|||
if (device_target != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_target); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_target = nullptr; |
|||
} |
|||
} |
|||
|
|||
template <typename ValueType, bool Relative> |
|||
void basicValueIteration_equalModuloPrecision(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType& maxElement) { |
|||
ValueType* device_x = nullptr; |
|||
ValueType* device_y = nullptr; |
|||
|
|||
const size_t vectorSize = x.size(); |
|||
|
|||
cudaError_t cudaMallocResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * vectorSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_y), sizeof(ValueType) * vectorSize); |
|||
if (cudaMallocResult != cudaSuccess) { |
|||
std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
// Memory allocated, copy data to device |
|||
cudaError_t cudaCopyResult; |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|||
if (cudaCopyResult != cudaSuccess) { |
|||
std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl; |
|||
goto cleanup; |
|||
} |
|||
|
|||
do { |
|||
// Transform: x = abs(x - y) / y element-wise (absolute difference when Relative is false)
|||
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x); |
|||
thrust::device_ptr<ValueType> devicePtrThrust_y(device_y); |
|||
thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
|
|||
// Reduce: take the maximum over x; the caller compares the result against the precision
|||
maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>()); |
|||
CUDA_CHECK_ALL_ERRORS(); |
|||
} while (false); |
|||
|
|||
// All code related to freeing memory and clearing up the device |
|||
cleanup: |
|||
if (device_x != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_x); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_x = nullptr; |
|||
} |
|||
if (device_y != nullptr) { |
|||
cudaError_t cudaFreeResult = cudaFree(device_y); |
|||
if (cudaFreeResult != cudaSuccess) { |
|||
std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." << std::endl; |
|||
} |
|||
device_y = nullptr; |
|||
} |
|||
} |
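// Illustrative convergence check (hypothetical host code, not part of the
// original source): the caller compares the returned maximum difference
// against the desired precision, e.g.
//
//   double maxDiff = 0.0;
//   basicValueIteration_equalModuloPrecision_double_NonRelative(x, xSwap, maxDiff);
//   bool converged = (maxDiff < precision); // x, xSwap and precision are assumed host-side names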
|||
|
|||
/* |
|||
* Declare and implement all exported functions for these Kernels here |
|||
* |
|||
*/ |
|||
|
|||
void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) { |
|||
basicValueIteration_spmv<uint_fast64_t, double>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); |
|||
} |
|||
|
|||
void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) { |
|||
basicValueIteration_addVectorsInplace<double>(a, b); |
|||
} |
|||
|
|||
void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) { |
|||
basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(groupedVector, grouping, targetVector); |
|||
} |
|||
|
|||
void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) { |
|||
basicValueIteration_reduceGroupedVector<uint_fast64_t, double, false>(groupedVector, grouping, targetVector); |
|||
} |
|||
|
|||
void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) { |
|||
basicValueIteration_equalModuloPrecision<double, true>(x, y, maxElement); |
|||
} |
|||
|
|||
void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) { |
|||
basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement); |
|||
} |
|||
|
|||
// Float |
|||
void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b) { |
|||
basicValueIteration_spmv<uint_fast64_t, float>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); |
|||
} |
|||
|
|||
void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b) { |
|||
basicValueIteration_addVectorsInplace<float>(a, b); |
|||
} |
|||
|
|||
void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) { |
|||
basicValueIteration_reduceGroupedVector<uint_fast64_t, float, true>(groupedVector, grouping, targetVector); |
|||
} |
|||
|
|||
void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) { |
|||
basicValueIteration_reduceGroupedVector<uint_fast64_t, float, false>(groupedVector, grouping, targetVector); |
|||
} |
|||
|
|||
void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) { |
|||
basicValueIteration_equalModuloPrecision<float, true>(x, y, maxElement); |
|||
} |
|||
|
|||
void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) { |
|||
basicValueIteration_equalModuloPrecision<float, false>(x, y, maxElement); |
|||
} |
|||
|
|||
bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
if (relativePrecisionCheck) { |
|||
return basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} else { |
|||
return basicValueIteration_mvReduce<true, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} |
|||
} |
|||
|
|||
bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
if (relativePrecisionCheck) { |
|||
return basicValueIteration_mvReduce<false, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} else { |
|||
return basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} |
|||
} |
|||
|
|||
bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
if (relativePrecisionCheck) { |
|||
return basicValueIteration_mvReduce<true, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} else { |
|||
return basicValueIteration_mvReduce<true, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} |
|||
} |
|||
|
|||
bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
if (relativePrecisionCheck) { |
|||
return basicValueIteration_mvReduce<false, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} else { |
|||
return basicValueIteration_mvReduce<false, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
} |
|||
} |
|||
|
|||
size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) { |
|||
size_t const valueTypeSize = sizeof(double); |
|||
size_t const indexTypeSize = sizeof(uint_fast64_t); |
|||
|
|||
/* |
|||
IndexType* device_matrixRowIndices = nullptr; |
|||
IndexType* device_matrixColIndices = nullptr; |
|||
ValueType* device_matrixValues = nullptr; |
|||
ValueType* device_x = nullptr; |
|||
ValueType* device_xSwap = nullptr; |
|||
ValueType* device_b = nullptr; |
|||
ValueType* device_multiplyResult = nullptr; |
|||
IndexType* device_nondeterministicChoiceIndices = nullptr; |
|||
*/ |
|||
|
|||
// Row Indices, Column Indices, Values, Choice Indices |
|||
size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize); |
|||
// Vectors x, xSwap, b, multiplyResult |
|||
size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize); |
|||
|
|||
return (matrixDataSize + vectorSizes); |
|||
} |
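// Worked example for the estimate above (illustrative numbers only): a
// hypothetical MDP with rowCount = 6, rowGroupCount = 3 and nnzCount = 10,
// using 8-byte indices and 8-byte doubles, yields
//   matrixDataSize = (7 * 8) + (10 * 8) + (10 * 8) + (4 * 8) = 248 Bytes
//   vectorSizes    = (3 * 8) + (3 * 8) + (6 * 8) + (6 * 8)   = 144 Bytes
// so basicValueIteration_mvReduce_uint64_double_calculateMemorySize(6, 3, 10)
// returns 392 Bytes.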
|||
|
|||
size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) { |
|||
size_t const valueTypeSize = sizeof(float); |
|||
size_t const indexTypeSize = sizeof(uint_fast64_t); |
|||
|
|||
/* |
|||
IndexType* device_matrixRowIndices = nullptr; |
|||
IndexType* device_matrixColIndices = nullptr; |
|||
ValueType* device_matrixValues = nullptr; |
|||
ValueType* device_x = nullptr; |
|||
ValueType* device_xSwap = nullptr; |
|||
ValueType* device_b = nullptr; |
|||
ValueType* device_multiplyResult = nullptr; |
|||
IndexType* device_nondeterministicChoiceIndices = nullptr; |
|||
*/ |
|||
|
|||
// Row Indices, Column Indices, Values, Choice Indices |
|||
size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize); |
|||
// Vectors x, xSwap, b, multiplyResult |
|||
size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize); |
|||
|
|||
return (matrixDataSize + vectorSizes); |
|||
} |
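// --- Illustrative host-side sketch (not part of the original source) --------
// Minimal example of how the exported SpMV entry point could be driven from
// host code, assuming a hypothetical 2x2 CSR matrix in the flattened
// (column, value) layout declared in basicValueIteration.h. The function name
// exampleSpmvUsage and the surrounding guard macro are illustrative only.
#ifdef STORM_CUDAPLUGIN_COMPILE_USAGE_SKETCH
void exampleSpmvUsage() {
	// CSR row offsets: row 0 owns entry 0, row 1 owns entry 1.
	std::vector<uint_fast64_t> rowIndices = {0, 1, 2};
	// One (column, value) pair per non-zero entry.
	std::vector<storm::storage::MatrixEntry<double>> entries;
	entries.push_back(storm::storage::MatrixEntry<double>(0, 0.5));
	entries.push_back(storm::storage::MatrixEntry<double>(1, 1.0));
	std::vector<double> x = {1.0, 2.0};
	std::vector<double> result(2, 0.0); // receives A * x, here {0.5, 2.0}
	basicValueIteration_spmv_uint64_double(2, rowIndices, entries, x, result);
}
#endif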
@ -0,0 +1,107 @@ |
|||
#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
|||
#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
|||
|
|||
#include <cstdint> |
|||
#include <vector> |
|||
#include <utility> |
|||
|
|||
// Library exports |
|||
#include "cudaForStorm_Export.h" |
|||
|
|||
/* Helper declaration to cope with new internal format */ |
|||
#ifndef STORM_STORAGE_SPARSEMATRIX_H_ |
|||
namespace storm { |
|||
namespace storage { |
|||
template<typename T> |
|||
class MatrixEntry { |
|||
public: |
|||
/*! |
|||
* Constructs a matrix entry with the given column and value. |
|||
* |
|||
* @param column The column of the matrix entry. |
|||
* @param value The value of the matrix entry. |
|||
*/ |
|||
MatrixEntry(uint_fast64_t column, T value); |
|||
|
|||
/*! |
|||
* Move-constructs the matrix entry from the given column-value pair.
|||
* |
|||
* @param pair The column-value pair from which to move-construct the matrix entry. |
|||
*/ |
|||
MatrixEntry(std::pair<uint_fast64_t, T>&& pair); |
|||
|
|||
//MatrixEntry() = default; |
|||
//MatrixEntry(MatrixEntry const& other) = default; |
|||
//MatrixEntry& operator=(MatrixEntry const& other) = default; |
|||
#ifndef WINDOWS |
|||
//MatrixEntry(MatrixEntry&& other) = default; |
|||
//MatrixEntry& operator=(MatrixEntry&& other) = default; |
|||
#endif |
|||
|
|||
/*! |
|||
* Retrieves the column of the matrix entry. |
|||
* |
|||
* @return The column of the matrix entry. |
|||
*/ |
|||
uint_fast64_t const& getColumn() const; |
|||
|
|||
/*! |
|||
* Retrieves the column of the matrix entry. |
|||
* |
|||
* @return The column of the matrix entry. |
|||
*/ |
|||
uint_fast64_t& getColumn(); |
|||
|
|||
/*! |
|||
* Retrieves the value of the matrix entry. |
|||
* |
|||
* @return The value of the matrix entry. |
|||
*/ |
|||
T const& getValue() const; |
|||
|
|||
/*! |
|||
* Retrieves the value of the matrix entry. |
|||
* |
|||
* @return The value of the matrix entry. |
|||
*/ |
|||
T& getValue(); |
|||
|
|||
/*! |
|||
* Retrieves a pair of column and value that characterizes this entry. |
|||
* |
|||
* @return A column-value pair that characterizes this entry. |
|||
*/ |
|||
std::pair<uint_fast64_t, T> const& getColumnValuePair() const; |
|||
|
|||
private: |
|||
// The actual matrix entry. |
|||
std::pair<uint_fast64_t, T> entry; |
|||
}; |
|||
|
|||
} |
|||
} |
|||
#endif |
|||
|
|||
cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); |
|||
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
|||
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
|||
|
|||
cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); |
|||
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
|||
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
|||
|
|||
cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b); |
|||
cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b); |
|||
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector); |
|||
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector); |
|||
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement); |
|||
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement); |
|||
|
|||
cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b); |
|||
cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b); |
|||
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector); |
|||
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector); |
|||
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement); |
|||
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement); |
|||
|
|||
#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
@ -0,0 +1,19 @@ |
|||
#ifndef STORM_CUDAFORSTORM_CUDAFORSTORM_H_ |
|||
#define STORM_CUDAFORSTORM_CUDAFORSTORM_H_ |
|||
|
|||
/* |
|||
* List of exported functions in this library |
|||
*/ |
|||
|
|||
// TopologicalValueIteration |
|||
#include "srcCuda/basicValueIteration.h" |
|||
|
|||
// Utility Functions |
|||
#include "srcCuda/utility.h" |
|||
|
|||
// Version Information |
|||
#include "srcCuda/version.h" |
|||
|
|||
|
|||
|
|||
#endif // STORM_CUDAFORSTORM_CUDAFORSTORM_H_ |
@ -0,0 +1,49 @@ |
|||
#pragma once |
|||
|
|||
#include "cuspExtensionFloat.h" |
|||
#include "cuspExtensionDouble.h" |
|||
|
|||
namespace cusp { |
|||
namespace detail { |
|||
namespace device { |
|||
|
|||
template <typename ValueType> |
|||
void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) { |
|||
// |
|||
throw; |
|||
} |
|||
template <> |
|||
void storm_cuda_opt_spmv_csr_vector<double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) { |
|||
storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
template <> |
|||
void storm_cuda_opt_spmv_csr_vector<float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) { |
|||
storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
template <bool Minimize, typename ValueType> |
|||
void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) { |
|||
// |
|||
throw; |
|||
} |
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<true, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) { |
|||
storm_cuda_opt_vector_reduce_double<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<false, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) { |
|||
storm_cuda_opt_vector_reduce_double<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
|
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<true, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) { |
|||
storm_cuda_opt_vector_reduce_float<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
template <> |
|||
void storm_cuda_opt_vector_reduce<false, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) { |
|||
storm_cuda_opt_vector_reduce_float<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y); |
|||
} |
|||
|
|||
} // end namespace device |
|||
} // end namespace detail |
|||
} // end namespace cusp |
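// Illustrative call site (not part of the original header): only the double
// and float specializations above are implemented; the generic templates are
// not meant to be instantiated. The plugin invokes the dispatcher as in
// basicValueIteration.cu, with all pointers referring to device memory:
//   cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(
//       matrixRowCount, matrixNnzCount, device_matrixRowIndices,
//       device_matrixColIndicesAndValues, device_x, device_multiplyResult);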
@ -0,0 +1,361 @@ |
|||
/* |
|||
* This is an extension of the original CUSP csr_vector.h SPMV implementation. |
|||
* It is based on that code and incorporates changes to cope with the details
|||
* of the StoRM code. |
|||
* Changes have been made for 1) the different input format, 2) the sum calculation, and 3) the group-reduce algorithm.
|||
*/ |
|||
|
|||
/* |
|||
* Copyright 2008-2009 NVIDIA Corporation |
|||
* |
|||
* Licensed under the Apache License, Version 2.0 (the "License"); |
|||
* you may not use this file except in compliance with the License. |
|||
* You may obtain a copy of the License at |
|||
* |
|||
* http://www.apache.org/licenses/LICENSE-2.0 |
|||
* |
|||
* Unless required by applicable law or agreed to in writing, software |
|||
* distributed under the License is distributed on an "AS IS" BASIS, |
|||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
* See the License for the specific language governing permissions and |
|||
* limitations under the License. |
|||
*/ |
|||
|
|||
|
|||
#pragma once |
|||
|
|||
#include <limits> |
|||
#include <cstdint> |
|||
#include <algorithm> |
|||
|
|||
#include <math_functions.h> |
|||
|
|||
#include <cusp/detail/device/spmv/csr_vector.h> |
|||
|
|||
namespace cusp |
|||
{ |
|||
namespace detail |
|||
{ |
|||
namespace device |
|||
{ |
|||
|
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// CSR SpMV kernels based on a vector model (one warp per row) |
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// |
|||
// spmv_csr_vector_device |
|||
// Each row of the CSR matrix is assigned to a warp. The warp computes |
|||
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
|||
// the x vector, in parallel. This division of work implies that |
|||
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
|||
// manner (but generally not aligned). On GT200 these accesses are |
|||
// coalesced, unlike kernels based on the one-row-per-thread division of |
|||
// work. Since an entire 32-thread warp is assigned to each row, many |
|||
// threads will remain idle when their row contains a small number |
|||
// of elements. This code relies on implicit synchronization among |
|||
// threads in a warp. |
|||
// |
|||
// spmv_csr_vector_tex_device |
|||
// Same as spmv_csr_vector_device, except that the texture cache is
|||
// used for accessing the x vector. |
|||
// |
|||
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
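// Worked example of the index arithmetic in the kernel below (illustrative
// only): with THREADS_PER_BLOCK = 128 and THREADS_PER_VECTOR = 4,
// VECTORS_PER_BLOCK is 32. Thread 70 of block 1 then has
//   thread_id   = 128 * 1 + 70 = 198
//   thread_lane = 70 & 3       = 2
//   vector_id   = 198 / 4      = 49
//   vector_lane = 70 / 4       = 17
// so it cooperates on rows 49, 49 + 32 * gridDim.x, ... of the matrix.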
|||
|
|||
|
|||
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
|||
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
|||
__global__ void |
|||
storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y) |
|||
{ |
|||
__shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
|||
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local sum |
|||
double sum = 0; |
|||
|
|||
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
|||
} |
|||
} else { |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = sum; |
|||
|
|||
// reduce local sums to row sum |
|||
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
|||
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
|||
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
|||
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
|||
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
y[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize> |
|||
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) |
|||
__global__ void |
|||
storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer) |
|||
{ |
|||
__shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block |
|||
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local Min/Max |
|||
double localMinMaxElement = minMaxInitializer; |
|||
|
|||
if (THREADS_PER_ROW == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
else |
|||
{ |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = localMinMaxElement; |
|||
|
|||
// reduce local min/max to row min/max |
|||
if (Minimize) { |
|||
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
|||
|
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); |
|||
} else { |
|||
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); |
|||
} |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
x[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <bool Minimize, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) |
|||
{ |
|||
double __minMaxInitializer = -std::numeric_limits<double>::max(); |
|||
if (Minimize) { |
|||
__minMaxInitializer = std::numeric_limits<double>::max(); |
|||
} |
|||
const double minMaxInitializer = __minMaxInitializer; |
|||
|
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); |
|||
} |
|||
|
|||
template <bool Minimize> |
|||
void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) |
|||
{ |
|||
const uint_fast64_t rows_per_group = num_entries / num_rows; |
|||
|
|||
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
|
|||
__storm_cuda_opt_vector_reduce_double<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y); |
|||
} |
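// Illustrative dispatch (assumed numbers, not part of the original source):
// for num_rows = 1000 row groups covering num_entries = 3500 matrix rows,
// rows_per_group = 3500 / 1000 = 3, so the <Minimize, 4> instantiation is
// launched and each row group is reduced by 4 cooperating threads.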
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
} |
|||
|
|||
void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
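// Illustrative dispatch (assumed numbers, not part of the original source):
// a matrix with num_rows = 10000 and num_entries = 45000 has nnz_per_row =
// 45000 / 10000 = 4, so __storm_cuda_opt_spmv_csr_vector_double<false, 4> is
// launched and 128 / 4 = 32 rows are handled per thread block.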
|||
|
|||
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
// NON-OPT |
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
} |
|||
|
|||
void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
} |
|||
|
|||
void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
} |
|||
|
|||
} // end namespace device |
|||
} // end namespace detail |
|||
} // end namespace cusp |
@ -0,0 +1,375 @@ |
|||
/* |
|||
* This is an extension of the original CUSP csr_vector.h SPMV implementation. |
|||
* It is based on that code and incorporates changes to cope with the details
|||
* of the StoRM code. |
|||
* As this is mostly copy & paste, the original license still applies. |
|||
*/ |
|||
|
|||
/* |
|||
* Copyright 2008-2009 NVIDIA Corporation |
|||
* |
|||
* Licensed under the Apache License, Version 2.0 (the "License"); |
|||
* you may not use this file except in compliance with the License. |
|||
* You may obtain a copy of the License at |
|||
* |
|||
* http://www.apache.org/licenses/LICENSE-2.0 |
|||
* |
|||
* Unless required by applicable law or agreed to in writing, software |
|||
* distributed under the License is distributed on an "AS IS" BASIS, |
|||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
* See the License for the specific language governing permissions and |
|||
* limitations under the License. |
|||
*/ |
|||
|
|||
|
|||
#pragma once |
|||
|
|||
#include <limits> |
|||
#include <cstdint> |
|||
#include <algorithm> |
|||
|
|||
#include <math_functions.h> |
|||
|
|||
#include <cusp/detail/device/spmv/csr_vector.h> |
|||
|
|||
#include "storm-cudaplugin-config.h" |
|||
|
|||
namespace cusp |
|||
{ |
|||
namespace detail |
|||
{ |
|||
namespace device |
|||
{ |
|||
|
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// CSR SpMV kernels based on a vector model (one warp per row) |
|||
////////////////////////////////////////////////////////////////////////////// |
|||
// |
|||
// spmv_csr_vector_device |
|||
// Each row of the CSR matrix is assigned to a warp. The warp computes |
|||
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
|||
// the x vector, in parallel. This division of work implies that |
|||
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
|||
// manner (but generally not aligned). On GT200 these accesses are |
|||
// coalesced, unlike kernels based on the one-row-per-thread division of |
|||
// work. Since an entire 32-thread warp is assigned to each row, many |
|||
// threads will remain idle when their row contains a small number |
|||
// of elements. This code relies on implicit synchronization among |
|||
// threads in a warp. |
|||
// |
|||
// spmv_csr_vector_tex_device |
|||
// Same as spmv_csr_vector_device, except that the texture cache is |
|||
// used for accessing the x vector. |
|||
// |
|||
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
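//
// Hedged worked example (added for clarity, not in the original CUSP comment): with
// THREADS_PER_BLOCK = 128 and THREADS_PER_VECTOR = 4, every block holds
// VECTORS_PER_BLOCK = 32 row-vectors. For blockIdx.x = 1 and threadIdx.x = 70:
//   thread_id   = 128 * 1 + 70 = 198
//   thread_lane = 70 & (4 - 1) = 2     (position within the 4-thread vector)
//   vector_id   = 198 / 4      = 49    (the first row this thread helps to process)
//   vector_lane = 70 / 4       = 17    (which of the 32 vectors inside the block)
// The grid-stride loop then advances each vector by VECTORS_PER_BLOCK * gridDim.x rows.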
|||
|
|||
|
|||
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
|||
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
|||
__global__ void |
|||
storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y) |
|||
{ |
|||
__shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
|||
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local sum |
|||
float sum = 0; |
|||
|
|||
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
|||
#else |
|||
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
|||
#endif |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
|||
#else |
|||
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
|||
#endif |
|||
} |
|||
} else { |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { |
|||
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
|||
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
|||
#else |
|||
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
|||
#endif |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = sum; |
|||
|
|||
// reduce local sums to row sum |
|||
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
|||
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
|||
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
|||
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
|||
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
y[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize> |
|||
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) |
|||
__global__ void |
|||
storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer) |
|||
{ |
|||
__shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals |
|||
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; |
|||
|
|||
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; |
|||
|
|||
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
|||
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector |
|||
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index |
|||
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block |
|||
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors |
|||
|
|||
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
|||
{ |
|||
// use two threads to fetch Ap[row] and Ap[row+1] |
|||
// this is considerably faster than the straightforward version |
|||
if(thread_lane < 2) |
|||
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; |
|||
|
|||
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
|||
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
|||
|
|||
// initialize local Min/Max |
|||
float localMinMaxElement = minMaxInitializer; |
|||
|
|||
if (THREADS_PER_ROW == 32 && row_end - row_start > 32) |
|||
{ |
|||
// ensure aligned memory access to Aj and Ax |
|||
|
|||
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; |
|||
|
|||
// accumulate local sums |
|||
if(jj >= row_start && jj < row_end) { |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
|
|||
// accumulate local sums |
|||
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
else |
|||
{ |
|||
// accumulate local sums |
|||
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) |
|||
if(Minimize) { |
|||
localMinMaxElement = min(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
|||
} else { |
|||
localMinMaxElement = max(localMinMaxElement, y[jj]); |
|||
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
|||
} |
|||
} |
|||
|
|||
// store local sum in shared memory |
|||
sdata[threadIdx.x] = localMinMaxElement; |
|||
|
|||
// reduce local min/max to row min/max |
|||
if (Minimize) { |
|||
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
|||
|
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); |
|||
} else { |
|||
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
|||
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); |
|||
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); |
|||
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); |
|||
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); |
|||
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); |
|||
} |
|||
|
|||
// first thread writes the result |
|||
if (thread_lane == 0) |
|||
x[row] = sdata[threadIdx.x]; |
|||
} |
|||
} |
|||
|
|||
template <bool Minimize, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) |
|||
{ |
|||
float __minMaxInitializer = -std::numeric_limits<float>::max(); |
|||
if (Minimize) { |
|||
__minMaxInitializer = std::numeric_limits<float>::max(); |
|||
} |
|||
const float minMaxInitializer = __minMaxInitializer; |
|||
|
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); |
|||
} |
|||
|
|||
template <bool Minimize> |
|||
void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) |
|||
{ |
|||
const uint_fast64_t rows_per_group = num_entries / num_rows; |
|||
|
|||
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_float<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_float<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_float<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
|||
|
|||
__storm_cuda_opt_vector_reduce_float<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y); |
|||
} |
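// Hedged illustration (not part of the original file): the reduce above produces one value
// per row group, i.e. per state. For nondeterministicChoiceIndices = {0, 2, 5} and
// y = {0.3, 0.7, 0.1, 0.9, 0.5}, a minimizing reduce yields
//   x[0] = min(y[0], y[1]) = 0.3   and   x[1] = min(y[2], y[3], y[4]) = 0.1,
// while a maximizing reduce yields x[0] = 0.7 and x[1] = 0.9.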
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
} |
|||
|
|||
void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
|||
|
|||
__storm_cuda_opt_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
|||
} |
|||
|
|||
// NON-OPT |
|||
|
|||
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
|||
void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
|||
{ |
|||
const size_t THREADS_PER_BLOCK = 128; |
|||
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
|||
|
|||
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
|||
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
|||
|
|||
if (UseCache) |
|||
bind_x(x); |
|||
|
|||
spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
|||
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
|
|||
if (UseCache) |
|||
unbind_x(x); |
|||
} |
|||
|
|||
void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
} |
|||
|
|||
void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
|||
{ |
|||
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
|||
|
|||
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
|||
|
|||
__storm_cuda_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
|||
} |
|||
|
|||
} // end namespace device |
|||
} // end namespace detail |
|||
} // end namespace cusp |
@ -0,0 +1,39 @@ |
|||
#include <iostream> |
|||
#include <chrono> |
|||
|
|||
__global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) { |
|||
*B = *A; |
|||
} |
|||
|
|||
void kernelSwitchTest(size_t N) { |
|||
int* deviceIntA; |
|||
int* deviceIntB; |
|||
|
|||
if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) { |
|||
std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; |
|||
return; |
|||
} |
|||
if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) { |
|||
std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl; |
|||
return; |
|||
} |
|||
|
|||
// Launch the kernel N times and measure the elapsed host-side time |
|||
auto start_time = std::chrono::high_resolution_clock::now(); |
|||
for (size_t i = 0; i < N; ++i) { |
|||
cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB); |
|||
} |
|||
auto end_time = std::chrono::high_resolution_clock::now(); |
|||
std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl; |
|||
std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl; |
|||
|
|||
// Free memory on device |
|||
if (cudaFree(deviceIntA) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
return; |
|||
} |
|||
if (cudaFree(deviceIntB) != cudaSuccess) { |
|||
std::cout << "Error in cudaFree!" << std::endl; |
|||
return; |
|||
} |
|||
} |
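// Hedged alternative sketch (not part of the original file): kernel launches are asynchronous,
// so the chrono-based loop above mostly measures launch overhead on the host. A device-side
// measurement with CUDA events could look like the following; the function name
// kernelSwitchTestEventTiming is hypothetical.
void kernelSwitchTestEventTiming(size_t N) {
	int* deviceIntA;
	int* deviceIntB;
	if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) return;
	if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) return;

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventRecord(start);
	for (size_t i = 0; i < N; ++i) {
		cuda_kernel_kernelSwitchTest<<<1, 1>>>(deviceIntA, deviceIntB);
	}
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);

	float milliseconds = 0.0f;
	cudaEventElapsedTime(&milliseconds, start, stop);
	std::cout << "Launching the kernel " << N << " times took " << milliseconds << " ms on the device." << std::endl;

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(deviceIntA);
	cudaFree(deviceIntB);
}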
@ -0,0 +1 @@ |
|||
void kernelSwitchTest(size_t N); |
@ -0,0 +1,33 @@ |
|||
#include "utility.h" |
|||
|
|||
#include <cuda_runtime.h> |
|||
|
|||
size_t getFreeCudaMemory() { |
|||
size_t freeMemory; |
|||
size_t totalMemory; |
|||
cudaMemGetInfo(&freeMemory, &totalMemory); |
|||
|
|||
return freeMemory; |
|||
} |
|||
|
|||
size_t getTotalCudaMemory() { |
|||
size_t freeMemory; |
|||
size_t totalMemory; |
|||
cudaMemGetInfo(&freeMemory, &totalMemory); |
|||
|
|||
return totalMemory; |
|||
} |
|||
|
|||
bool resetCudaDevice() { |
|||
cudaError_t result = cudaDeviceReset(); |
|||
return (result == cudaSuccess); |
|||
} |
|||
|
|||
int getRuntimeCudaVersion() { |
|||
int result = -1; |
|||
cudaError_t errorResult = cudaRuntimeGetVersion(&result); |
|||
if (errorResult != cudaSuccess) { |
|||
return -1; |
|||
} |
|||
return result; |
|||
} |
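// Hedged usage sketch (not part of the original file): deciding whether a problem of a given
// size fits onto the device before choosing the CUDA-based solver. The helper name
// hasEnoughFreeCudaMemory and the requiredBytes parameter are hypothetical.
bool hasEnoughFreeCudaMemory(size_t requiredBytes) {
	// Leave a small safety margin, mirroring the 95% padding used by the topological solver.
	return static_cast<size_t>(getFreeCudaMemory() * 0.95) >= requiredBytes;
}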
@ -0,0 +1,12 @@ |
|||
#ifndef STORM_CUDAFORSTORM_UTILITY_H_ |
|||
#define STORM_CUDAFORSTORM_UTILITY_H_ |
|||
|
|||
// Library exports |
|||
#include "cudaForStorm_Export.h" |
|||
|
|||
cudaForStorm_EXPORT size_t getFreeCudaMemory(); |
|||
cudaForStorm_EXPORT size_t getTotalCudaMemory(); |
|||
cudaForStorm_EXPORT bool resetCudaDevice(); |
|||
cudaForStorm_EXPORT int getRuntimeCudaVersion(); |
|||
|
|||
#endif // STORM_CUDAFORSTORM_UTILITY_H_ |
@ -0,0 +1,28 @@ |
|||
#include "version.h" |
|||
|
|||
#include "storm-cudaplugin-config.h" |
|||
|
|||
size_t getStormCudaPluginVersionMajor() { |
|||
return STORM_CUDAPLUGIN_VERSION_MAJOR; |
|||
} |
|||
|
|||
size_t getStormCudaPluginVersionMinor() { |
|||
return STORM_CUDAPLUGIN_VERSION_MINOR; |
|||
} |
|||
|
|||
size_t getStormCudaPluginVersionPatch() { |
|||
return STORM_CUDAPLUGIN_VERSION_PATCH; |
|||
} |
|||
|
|||
size_t getStormCudaPluginVersionCommitsAhead() { |
|||
return STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD; |
|||
} |
|||
|
|||
const char* getStormCudaPluginVersionHash() { |
|||
static const std::string versionHash = STORM_CUDAPLUGIN_VERSION_HASH; |
|||
return versionHash.c_str(); |
|||
} |
|||
|
|||
bool getStormCudaPluginVersionIsDirty() { |
|||
return ((STORM_CUDAPLUGIN_VERSION_DIRTY) != 0); |
|||
} |
@ -0,0 +1,16 @@ |
|||
#ifndef STORM_CUDAFORSTORM_VERSION_H_ |
|||
#define STORM_CUDAFORSTORM_VERSION_H_ |
|||
|
|||
// Library exports |
|||
#include "cudaForStorm_Export.h" |
|||
|
|||
#include <string> |
|||
|
|||
cudaForStorm_EXPORT size_t getStormCudaPluginVersionMajor(); |
|||
cudaForStorm_EXPORT size_t getStormCudaPluginVersionMinor(); |
|||
cudaForStorm_EXPORT size_t getStormCudaPluginVersionPatch(); |
|||
cudaForStorm_EXPORT size_t getStormCudaPluginVersionCommitsAhead(); |
|||
cudaForStorm_EXPORT const char* getStormCudaPluginVersionHash(); |
|||
cudaForStorm_EXPORT bool getStormCudaPluginVersionIsDirty(); |
|||
|
|||
#endif // STORM_CUDAFORSTORM_VERSION_H_ |
@ -0,0 +1,21 @@ |
|||
/* |
|||
* StoRM - Built-in Options |
|||
* |
|||
* This file is parsed by CMake during makefile generation |
|||
*/ |
|||
|
|||
#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ |
|||
#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ |
|||
|
|||
// Version Information |
|||
#define STORM_CUDAPLUGIN_VERSION_MAJOR @STORM_CUDAPLUGIN_VERSION_MAJOR@ // The major version of StoRM |
|||
#define STORM_CUDAPLUGIN_VERSION_MINOR @STORM_CUDAPLUGIN_VERSION_MINOR@ // The minor version of StoRM |
|||
#define STORM_CUDAPLUGIN_VERSION_PATCH @STORM_CUDAPLUGIN_VERSION_PATCH@ // The patch version of StoRM |
|||
#define STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD @STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD@ // How many commits passed since the tag was last set |
|||
#define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" // The short hash of the git commit this build is based on |
|||
#define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ // 0 iff no files were modified in the checkout, 1 otherwise |
|||
|
|||
// Whether the size of float in a pair<uint_fast64_t, float> is expanded to 64bit |
|||
#@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
|||
|
|||
#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ |
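/*
 * Hedged illustration (not part of the generated header): the alignment flag above matches the
 * element stride used by the kernels in cuspExtensionFloat.h. Each packed CSR entry is a 64-bit
 * column index followed by a 32-bit float value; without padding that is 12 bytes, i.e. a stride
 * of 3 floats (the "3 * jj" indexing), while with 64-bit alignment it is padded to 16 bytes,
 * i.e. a stride of 4 floats (the "4 * jj" indexing):
 *
 *   #ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
 *   const size_t floatsPerEntry = 4; // 8-byte index + 4-byte value + 4 bytes of padding
 *   #else
 *   const size_t floatsPerEntry = 3; // 8-byte index + 4-byte value, tightly packed
 *   #endif
 */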
@ -0,0 +1,111 @@ |
|||
#include "src/models/PseudoModel.h"
|
|||
#include "src/utility/constants.h"
|
|||
#include "src/models/AbstractModel.h"
|
|||
|
|||
namespace storm { |
|||
namespace models { |
|||
|
|||
template<typename ValueType> |
|||
ModelBasedPseudoModel<ValueType>::ModelBasedPseudoModel(storm::models::AbstractModel<ValueType> const& model) : _model(model) { |
|||
// Intentionally left empty.
|
|||
} |
|||
|
|||
template<typename ValueType> |
|||
NonDeterministicMatrixBasedPseudoModel<ValueType>::NonDeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix<ValueType> const& matrix, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) : _matrix(matrix), _nondeterministicChoiceIndices(nondeterministicChoiceIndices) { |
|||
// Intentionally left empty.
|
|||
} |
|||
|
|||
template<typename ValueType> |
|||
DeterministicMatrixBasedPseudoModel<ValueType>::DeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix<ValueType> const& matrix) : _matrix(matrix) { |
|||
// Intentionally left empty.
|
|||
} |
|||
|
|||
template<typename ValueType> |
|||
typename storm::storage::SparseMatrix<ValueType>::const_rows |
|||
ModelBasedPseudoModel<ValueType>::getRows(uint_fast64_t state) const { |
|||
return this->_model.getRows(state); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
typename storm::storage::SparseMatrix<ValueType>::const_rows |
|||
NonDeterministicMatrixBasedPseudoModel<ValueType>::getRows(uint_fast64_t state) const { |
|||
return this->_matrix.getRows(this->_nondeterministicChoiceIndices[state], this->_nondeterministicChoiceIndices[state + 1] - 1); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
typename storm::storage::SparseMatrix<ValueType>::const_rows |
|||
DeterministicMatrixBasedPseudoModel<ValueType>::getRows(uint_fast64_t state) const { |
|||
return this->_matrix.getRows(state, state); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
uint_fast64_t |
|||
ModelBasedPseudoModel<ValueType>::getNumberOfStates() const { |
|||
return this->_model.getNumberOfStates(); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
uint_fast64_t |
|||
NonDeterministicMatrixBasedPseudoModel<ValueType>::getNumberOfStates() const { |
|||
return this->_matrix.getColumnCount(); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
uint_fast64_t |
|||
DeterministicMatrixBasedPseudoModel<ValueType>::getNumberOfStates() const { |
|||
return this->_matrix.getColumnCount(); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
storm::storage::SparseMatrix<ValueType> |
|||
AbstractPseudoModel<ValueType>::extractPartitionDependencyGraph(storm::storage::Decomposition<storm::storage::StateBlock> const& decomposition) const { |
|||
uint_fast64_t numberOfStates = decomposition.size(); |
|||
|
|||
// First, we need to create a mapping of states to their SCC index, to ease the computation of dependency transitions later.
|
|||
std::vector<uint_fast64_t> stateToBlockMap(this->getNumberOfStates()); |
|||
for (uint_fast64_t i = 0; i < decomposition.size(); ++i) { |
|||
for (auto state : decomposition[i]) { |
|||
stateToBlockMap[state] = i; |
|||
} |
|||
} |
|||
|
|||
// The resulting sparse matrix will have as many rows/columns as there are blocks in the partition.
|
|||
storm::storage::SparseMatrixBuilder<ValueType> dependencyGraphBuilder(numberOfStates, numberOfStates); |
|||
|
|||
for (uint_fast64_t currentBlockIndex = 0; currentBlockIndex < decomposition.size(); ++currentBlockIndex) { |
|||
// Get the next block.
|
|||
typename storm::storage::StateBlock const& block = decomposition[currentBlockIndex]; |
|||
|
|||
// Now, we determine the blocks which are reachable (in one step) from the current block.
|
|||
boost::container::flat_set<uint_fast64_t> allTargetBlocks; |
|||
for (auto state : block) { |
|||
for (auto const& transitionEntry : this->getRows(state)) { |
|||
uint_fast64_t targetBlock = stateToBlockMap[transitionEntry.getColumn()]; |
|||
|
|||
// We only need to consider transitions that are actually leaving the SCC.
|
|||
if (targetBlock != currentBlockIndex) { |
|||
allTargetBlocks.insert(targetBlock); |
|||
} |
|||
} |
|||
} |
|||
|
|||
// Now we can just enumerate all the target SCCs and insert the corresponding transitions.
|
|||
for (auto targetBlock : allTargetBlocks) { |
|||
dependencyGraphBuilder.addNextValue(currentBlockIndex, targetBlock, storm::utility::constantOne<ValueType>()); |
|||
} |
|||
} |
|||
|
|||
return dependencyGraphBuilder.build(); |
|||
} |
|||
|
|||
template class ModelBasedPseudoModel<double>; |
|||
template class NonDeterministicMatrixBasedPseudoModel<double>; |
|||
template class DeterministicMatrixBasedPseudoModel<double>; |
|||
template class ModelBasedPseudoModel <float> ; |
|||
template class NonDeterministicMatrixBasedPseudoModel <float>; |
|||
template class DeterministicMatrixBasedPseudoModel <float>; |
|||
template class ModelBasedPseudoModel<int>; |
|||
template class NonDeterministicMatrixBasedPseudoModel<int>; |
|||
template class DeterministicMatrixBasedPseudoModel<int>; |
|||
} // namespace models
|
|||
} // namespace storm
|
@ -0,0 +1,90 @@ |
|||
#ifndef STORM_MODELS_PSEUDOMODEL_H_ |
|||
#define STORM_MODELS_PSEUDOMODEL_H_ |
|||
|
|||
#include <cstdint> |
|||
#include "src/storage/SparseMatrix.h" |
|||
#include "src/storage/Decomposition.h" |
|||
|
|||
namespace storm { |
|||
namespace models { |
|||
// Forward declare the abstract model class. |
|||
template <typename ValueType> class AbstractModel; |
|||
|
|||
/*! |
|||
* These classes encapsulate the model/transition matrix on which the SCC decomposition is performed. |
|||
* The abstract base class is specialized by three possible representations: |
|||
* - For a model the implementation ModelBasedPseudoModel hands the call to getRows() through to the model |
|||
* - For a matrix of a nondeterministic model the implementation NonDeterministicMatrixBasedPseudoModel emulates the call |
|||
* on the matrix itself like the model function would |
|||
* - For a matrix of a deterministic model the implementation DeterministicMatrixBasedPseudoModel emulates the call |
|||
* on the matrix itself like the model function would |
|||
*/ |
|||
template <typename ValueType> |
|||
class AbstractPseudoModel { |
|||
public: |
|||
AbstractPseudoModel() {} |
|||
virtual ~AbstractPseudoModel() {} |
|||
|
|||
virtual typename storm::storage::SparseMatrix<ValueType>::const_rows getRows(uint_fast64_t state) const = 0; |
|||
|
|||
/*! |
|||
* Calculates the number of states in the represented system. |
|||
* @return The Number of States in the underlying model/transition matrix |
|||
*/ |
|||
virtual uint_fast64_t getNumberOfStates() const = 0; |
|||
|
|||
/*! |
|||
* Extracts the dependency graph from a (pseudo-) model according to the given partition. |
|||
* |
|||
* @param decomposition A decomposition containing the blocks of the partition of the system. |
|||
* @return A sparse matrix with bool entries that represents the dependency graph of the blocks of the partition. |
|||
*/ |
|||
virtual storm::storage::SparseMatrix<ValueType> extractPartitionDependencyGraph(storm::storage::Decomposition<storm::storage::StateBlock> const& decomposition) const; |
|||
}; |
|||
|
|||
template <typename ValueType> |
|||
class ModelBasedPseudoModel : public AbstractPseudoModel<ValueType> { |
|||
public: |
|||
/*! |
|||
* Creates an encapsulation for the SCC decomposition based on a model |
|||
* @param model The Model on which the decomposition is to be performed |
|||
*/ |
|||
ModelBasedPseudoModel(storm::models::AbstractModel<ValueType> const& model); |
|||
virtual typename storm::storage::SparseMatrix<ValueType>::const_rows getRows(uint_fast64_t state) const override; |
|||
virtual uint_fast64_t getNumberOfStates() const override; |
|||
private: |
|||
storm::models::AbstractModel<ValueType> const& _model; |
|||
}; |
|||
|
|||
template <typename ValueType> |
|||
class NonDeterministicMatrixBasedPseudoModel : public AbstractPseudoModel<ValueType> { |
|||
public: |
|||
/*! |
|||
* Creates an encapsulation for the SCC decomposition based on a matrix |
|||
* @param matrix The Matrix on which the decomposition is to be performed |
|||
*/ |
|||
NonDeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix<ValueType> const& matrix, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices); |
|||
virtual typename storm::storage::SparseMatrix<ValueType>::const_rows getRows(uint_fast64_t state) const override; |
|||
virtual uint_fast64_t getNumberOfStates() const override; |
|||
private: |
|||
storm::storage::SparseMatrix<ValueType> const& _matrix; |
|||
std::vector<uint_fast64_t> const& _nondeterministicChoiceIndices; |
|||
}; |
|||
|
|||
template <typename ValueType> |
|||
class DeterministicMatrixBasedPseudoModel : public AbstractPseudoModel<ValueType> { |
|||
public: |
|||
/*! |
|||
* Creates an encapsulation for the SCC decomposition based on a matrix |
|||
* @param matrix The Matrix on which the decomposition is to be performed |
|||
*/ |
|||
DeterministicMatrixBasedPseudoModel(storm::storage::SparseMatrix<ValueType> const& matrix); |
|||
virtual typename storm::storage::SparseMatrix<ValueType>::const_rows getRows(uint_fast64_t state) const override; |
|||
virtual uint_fast64_t getNumberOfStates() const override; |
|||
private: |
|||
storm::storage::SparseMatrix<ValueType> const& _matrix; |
|||
}; |
|||
} |
|||
} |
|||
|
|||
#endif // STORM_MODELS_PSEUDOMODEL_H_ |
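// Hedged usage sketch (not part of the original header): wrapping the transition matrix of a
// nondeterministic model so that the SCC decomposition can run on it directly, mirroring the
// calls made by the topological solver. A and nondeterministicChoiceIndices are assumed to be
// the usual sparse transition matrix and row-group index vector of an MDP.
//
//   storm::models::NonDeterministicMatrixBasedPseudoModel<double> pseudoModel(A, nondeterministicChoiceIndices);
//   storm::storage::StronglyConnectedComponentDecomposition<double> sccDecomposition(pseudoModel, false, false);
//   storm::storage::SparseMatrix<double> dependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);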
@ -0,0 +1,467 @@ |
|||
#include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h"
|
|||
|
|||
#include <utility>
|
|||
#include <chrono>
|
|||
|
|||
#include "src/settings/Settings.h"
|
|||
#include "src/utility/vector.h"
|
|||
#include "src/utility/graph.h"
|
|||
#include "src/models/PseudoModel.h"
|
|||
#include "src/storage/StronglyConnectedComponentDecomposition.h"
|
|||
#include "src/exceptions/IllegalArgumentException.h"
|
|||
#include "src/exceptions/InvalidStateException.h"
|
|||
|
|||
#include "log4cplus/logger.h"
|
|||
#include "log4cplus/loggingmacros.h"
|
|||
extern log4cplus::Logger logger; |
|||
|
|||
#include "storm-config.h"
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
# include "cudaForStorm.h"
|
|||
#endif
|
|||
|
|||
namespace storm { |
|||
namespace solver { |
|||
|
|||
template<typename ValueType> |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::TopologicalValueIterationNondeterministicLinearEquationSolver() { |
|||
// Get the settings object to customize solving.
|
|||
storm::settings::Settings* settings = storm::settings::Settings::getInstance(); |
|||
|
|||
// Get appropriate settings.
|
|||
this->maximalNumberOfIterations = settings->getOptionByLongName("maxiter").getArgument(0).getValueAsUnsignedInteger(); |
|||
this->precision = settings->getOptionByLongName("precision").getArgument(0).getValueAsDouble(); |
|||
this->relative = !settings->isSet("absolute"); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative) : NativeNondeterministicLinearEquationSolver<ValueType>(precision, maximalNumberOfIterations, relative) { |
|||
// Intentionally left empty.
|
|||
} |
|||
|
|||
template<typename ValueType> |
|||
NondeterministicLinearEquationSolver<ValueType>* TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::clone() const { |
|||
return new TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>(*this); |
|||
} |
|||
|
|||
template<typename ValueType> |
|||
void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const { |
|||
|
|||
#ifdef GPU_USE_FLOAT
|
|||
#define __FORCE_FLOAT_CALCULATION true
|
|||
#else
|
|||
#define __FORCE_FLOAT_CALCULATION false
|
|||
#endif
|
|||
if (__FORCE_FLOAT_CALCULATION && (sizeof(ValueType) == sizeof(double))) { |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver<float> tvindles(precision, maximalNumberOfIterations, relative); |
|||
|
|||
storm::storage::SparseMatrix<float> new_A = A.toValueType<float>(); |
|||
std::vector<float> new_x = storm::utility::vector::toValueType<float>(x); |
|||
std::vector<float> const new_b = storm::utility::vector::toValueType<float>(b); |
|||
|
|||
tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr); |
|||
|
|||
for (size_t i = 0, size = new_x.size(); i < size; ++i) { |
|||
x.at(i) = new_x.at(i); |
|||
} |
|||
return; |
|||
} |
|||
|
|||
// For testing only
|
|||
if (sizeof(ValueType) == sizeof(double)) { |
|||
std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; |
|||
LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>"); |
|||
} else { |
|||
std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; |
|||
LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>"); |
|||
} |
|||
|
|||
// Now, we need to determine the SCCs of the MDP and perform a topological sort.
|
|||
std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices(); |
|||
storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> const pseudoModel(A, nondeterministicChoiceIndices); |
|||
|
|||
// Check if the decomposition is necessary
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
#define __USE_CUDAFORSTORM_OPT true
|
|||
size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(A.getRowCount()), nondeterministicChoiceIndices.size(), static_cast<size_t>(A.getEntryCount())); |
|||
size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95); |
|||
#else
|
|||
#define __USE_CUDAFORSTORM_OPT false
|
|||
size_t const gpuSizeOfCompleteSystem = 0; |
|||
size_t const cudaFreeMemory = 0; |
|||
#endif
|
|||
std::vector<std::pair<bool, storm::storage::StateBlock>> sccDecomposition; |
|||
if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) { |
|||
// Dummy output for SCC Times
|
|||
std::cout << "Computing the SCC Decomposition took 0ms" << std::endl; |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
if (!resetCudaDevice()) { |
|||
LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); |
|||
throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; |
|||
} |
|||
|
|||
std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now(); |
|||
bool result = false; |
|||
size_t globalIterations = 0; |
|||
if (minimize) { |
|||
result = __basicValueIteration_mvReduce_uint64_minimize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); |
|||
} else { |
|||
result = __basicValueIteration_mvReduce_uint64_maximize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations); |
|||
} |
|||
LOG4CPLUS_INFO(logger, "Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); |
|||
|
|||
bool converged = false; |
|||
if (!result) { |
|||
converged = false; |
|||
LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); |
|||
throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; |
|||
} else { |
|||
converged = true; |
|||
} |
|||
|
|||
std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now(); |
|||
std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast<std::chrono::milliseconds>(calcEndTime - calcStartTime).count() << "ms." << std::endl; |
|||
|
|||
std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << globalIterations << " iterations in a single block." << std::endl; |
|||
|
|||
// Check if the solver converged and issue a warning otherwise.
|
|||
if (converged) { |
|||
LOG4CPLUS_INFO(logger, "Iterative solver converged after " << globalIterations << " iterations."); |
|||
} else { |
|||
LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << globalIterations << " iterations."); |
|||
} |
|||
#else
|
|||
LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); |
|||
throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; |
|||
#endif
|
|||
} else { |
|||
std::chrono::high_resolution_clock::time_point sccStartTime = std::chrono::high_resolution_clock::now(); |
|||
storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(pseudoModel, false, false); |
|||
|
|||
if (sccDecomposition.size() == 0) { |
|||
LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs."); |
|||
throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs."; |
|||
} |
|||
|
|||
storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); |
|||
std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); |
|||
|
|||
// Calculate the optimal distribution of sccs
|
|||
std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); |
|||
LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); |
|||
|
|||
std::chrono::high_resolution_clock::time_point sccEndTime = std::chrono::high_resolution_clock::now(); |
|||
std::cout << "Computing the SCC Decomposition took " << std::chrono::duration_cast<std::chrono::milliseconds>(sccEndTime - sccStartTime).count() << "ms." << std::endl; |
|||
|
|||
std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now(); |
|||
|
|||
std::vector<ValueType>* currentX = nullptr; |
|||
std::vector<ValueType>* swap = nullptr; |
|||
size_t currentMaxLocalIterations = 0; |
|||
size_t localIterations = 0; |
|||
size_t globalIterations = 0; |
|||
bool converged = true; |
|||
|
|||
// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
|
|||
// solved after all SCCs it depends on have been solved.
|
|||
int counter = 0; |
|||
|
|||
for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { |
|||
bool const useGpu = sccIndexIt->first; |
|||
storm::storage::StateBlock const& scc = sccIndexIt->second; |
|||
|
|||
// Generate a sub matrix
|
|||
storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); |
|||
storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); |
|||
std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount()); |
|||
storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); |
|||
std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount()); |
|||
std::vector<ValueType> sccSubXSwap(sccSubmatrix.getColumnCount()); |
|||
std::vector<ValueType> sccMultiplyResult(sccSubmatrix.getRowCount()); |
|||
|
|||
// Prepare the pointers for swapping in the calculation
|
|||
currentX = &sccSubX; |
|||
swap = &sccSubXSwap; |
|||
|
|||
storm::utility::vector::selectVectorValues<ValueType>(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states)
|
|||
std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); |
|||
sccSubNondeterministicChoiceIndices.at(0) = 0; |
|||
|
|||
// Pre-process all dependent states
|
|||
// Remove outgoing transitions and create the ChoiceIndices
|
|||
uint_fast64_t innerIndex = 0; |
|||
uint_fast64_t outerIndex = 0; |
|||
for (uint_fast64_t state : scc) { |
|||
// Choice Indices
|
|||
sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); |
|||
|
|||
for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { |
|||
typename storm::storage::SparseMatrix<ValueType>::const_rows row = A.getRow(rowGroupIt); |
|||
for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { |
|||
if (!subMatrixIndices.get(rowIt->getColumn())) { |
|||
// This is an outgoing transition of a state in the SCC to a state not included in the SCC
|
|||
// Subtracting Pr(tau) * x_other from b fixes that
|
|||
sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); |
|||
} |
|||
} |
|||
++innerIndex; |
|||
} |
|||
++outerIndex; |
|||
} |
|||
|
|||
// For the current SCC, we need to perform value iteration until convergence.
|
|||
if (useGpu) { |
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
if (!resetCudaDevice()) { |
|||
LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); |
|||
throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; |
|||
} |
|||
|
|||
//LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
|
|||
//LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
|
|||
//LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
|
|||
|
|||
bool result = false; |
|||
localIterations = 0; |
|||
if (minimize) { |
|||
result = __basicValueIteration_mvReduce_uint64_minimize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); |
|||
} else { |
|||
result = __basicValueIteration_mvReduce_uint64_maximize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); |
|||
} |
|||
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); |
|||
|
|||
if (!result) { |
|||
converged = false; |
|||
LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); |
|||
throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; |
|||
} else { |
|||
converged = true; |
|||
} |
|||
|
|||
// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
|
|||
// track of the maximum.
|
|||
if (localIterations > currentMaxLocalIterations) { |
|||
currentMaxLocalIterations = localIterations; |
|||
} |
|||
globalIterations += localIterations; |
|||
#else
|
|||
LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); |
|||
throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; |
|||
#endif
|
|||
} else { |
|||
std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl; |
|||
localIterations = 0; |
|||
converged = false; |
|||
while (!converged && localIterations < this->maximalNumberOfIterations) { |
|||
// Compute x' = A*x + b.
|
|||
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); |
|||
storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB); |
|||
|
|||
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
|
|||
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
|
|||
|
|||
/*
|
|||
Versus: |
|||
A.multiplyWithVector(*currentX, *multiplyResult); |
|||
storm::utility::vector::addVectorsInPlace(*multiplyResult, b); |
|||
*/ |
|||
|
|||
// Reduce the vector x' by applying min/max for all non-deterministic choices.
|
|||
if (minimize) { |
|||
storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); |
|||
} else { |
|||
storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); |
|||
} |
|||
|
|||
// Determine whether the method converged.
|
|||
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
|
|||
// running time. In fact, it is faster. This has to be investigated.
|
|||
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
|
|||
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative); |
|||
|
|||
// Update environment variables.
|
|||
std::swap(currentX, swap); |
|||
|
|||
++localIterations; |
|||
++globalIterations; |
|||
} |
|||
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); |
|||
} |
|||
|
|||
|
|||
// The Result of this SCC has to be taken back into the main result vector
|
|||
innerIndex = 0; |
|||
for (uint_fast64_t state : scc) { |
|||
x.at(state) = currentX->at(innerIndex); |
|||
++innerIndex; |
|||
} |
|||
|
|||
// Since the pointers for swapping in the calculation point to temps they should not be valid anymore
|
|||
currentX = nullptr; |
|||
swap = nullptr; |
|||
|
|||
// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
|
|||
// track of the maximum.
|
|||
if (localIterations > currentMaxLocalIterations) { |
|||
currentMaxLocalIterations = localIterations; |
|||
} |
|||
} |
|||
|
|||
std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << localIterations << " iterations in a single block." << std::endl; |
|||
|
|||
// Check if the solver converged and issue a warning otherwise.
|
|||
if (converged) { |
|||
LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); |
|||
} else { |
|||
LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations."); |
|||
} |
|||
|
|||
std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now(); |
|||
std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast<std::chrono::milliseconds>(calcEndTime - calcStartTime).count() << "ms." << std::endl; |
|||
} |
|||
} |
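// Hedged summary (not part of the original file): each CPU sweep above computes, for every
// state s of the current SCC and each of its choices c,
//     x'[s] = min/max over c of ( b[c] + sum over s' of A[c, s'] * x[s'] ),
// and stops once x and x' agree up to the configured precision (relative or absolute).
// Because the SCCs are processed in topological order, transitions leaving the SCC refer to
// states whose values are already fixed; their contribution was folded into sccSubB above.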
|||
|
|||
template<typename ValueType> |
|||
std::vector<std::pair<bool, storm::storage::StateBlock>> |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const { |
|||
std::vector<std::pair<bool, storm::storage::StateBlock>> result; |
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
// 95% to have a bit of padding
|
|||
size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95); |
|||
size_t lastResultIndex = 0; |
|||
|
|||
std::vector<uint_fast64_t> const& rowGroupIndices = matrix.getRowGroupIndices(); |
|||
|
|||
size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(matrix.getRowCount()), rowGroupIndices.size(), static_cast<size_t>(matrix.getEntryCount())); |
|||
size_t const gpuSizePerRowGroup = std::max(static_cast<size_t>(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast<size_t>(1)); |
|||
size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup; |
|||
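// Illustration with made-up numbers: if the complete system needs ~4 GB and has 4 million row
// groups, gpuSizePerRowGroup is roughly 1 KB; with ~3.8 GB of padded free memory this yields
// a maxRowGroupsPerMemory of roughly 3.8 million row groups per GPU-sized block.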
|
|||
size_t currentSize = 0; |
|||
size_t neededReserveSize = 0; |
|||
size_t startIndex = 0; |
|||
for (size_t i = 0; i < topologicalSort.size(); ++i) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]]; |
|||
size_t const currentSccSize = scc.size(); |
|||
|
|||
uint_fast64_t rowCount = 0; |
|||
uint_fast64_t entryCount = 0; |
|||
|
|||
for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { |
|||
rowCount += matrix.getRowGroupSize(*sccIt); |
|||
entryCount += matrix.getRowGroupEntryCount(*sccIt); |
|||
} |
|||
|
|||
size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(rowCount), scc.size(), static_cast<size_t>(entryCount)); |
|||
|
|||
if ((currentSize + sccSize) <= cudaFreeMemory) { |
|||
// There is enough space left in the current group
|
|||
neededReserveSize += currentSccSize; |
|||
currentSize += sccSize; |
|||
} else { |
|||
// This would make the last open group too big for the GPU
|
|||
|
|||
if (startIndex < i) { |
|||
if ((startIndex + 1) < i) { |
|||
// More than one component
|
|||
std::vector<uint_fast64_t> tempGroups; |
|||
tempGroups.reserve(neededReserveSize); |
|||
|
|||
// Copy the first group to make inplace_merge possible
|
|||
storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; |
|||
tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); |
|||
|
|||
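// For set counts <= 80, inplace merge is faster (same heuristic as for the final group below).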
if (((startIndex + 1) + 80) >= i) { |
|||
size_t lastSize = 0; |
|||
for (size_t j = startIndex + 1; j < i; ++j) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; |
|||
lastSize = tempGroups.size(); |
|||
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); |
|||
std::vector<uint_fast64_t>::iterator middleIterator = tempGroups.begin(); |
|||
std::advance(middleIterator, lastSize); |
|||
std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); |
|||
} |
|||
} else { |
|||
// Use std::sort
|
|||
for (size_t j = startIndex + 1; j < i; ++j) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; |
|||
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); |
|||
} |
|||
std::sort(tempGroups.begin(), tempGroups.end()); |
|||
} |
|||
result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); |
|||
} else { |
|||
// Only one group, copy construct.
|
|||
result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); |
|||
} |
|||
++lastResultIndex; |
|||
} |
|||
|
|||
if (sccSize <= cudaFreeMemory) { |
|||
currentSize = sccSize; |
|||
neededReserveSize = currentSccSize; |
|||
startIndex = i; |
|||
} else { |
|||
// This group is too big to fit into the CUDA memory by itself
|
|||
result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]])))); |
|||
++lastResultIndex; |
|||
|
|||
currentSize = 0; |
|||
neededReserveSize = 0; |
|||
startIndex = i + 1; |
|||
} |
|||
} |
|||
} |
|||
|
|||
size_t const topologicalSortSize = topologicalSort.size(); |
|||
if (startIndex < topologicalSortSize) { |
|||
if ((startIndex + 1) < topologicalSortSize) { |
|||
// More than one component
|
|||
std::vector<uint_fast64_t> tempGroups; |
|||
tempGroups.reserve(neededReserveSize); |
|||
|
|||
// Copy the first group to make inplace_merge possible
|
|||
storm::storage::StateBlock const& scc_first = sccDecomposition[topologicalSort[startIndex]]; |
|||
tempGroups.insert(tempGroups.cend(), scc_first.cbegin(), scc_first.cend()); |
|||
|
|||
// For set counts <= 80, Inplace Merge is faster
|
|||
if (((startIndex + 1) + 80) >= topologicalSortSize) { |
|||
size_t lastSize = 0; |
|||
for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; |
|||
lastSize = tempGroups.size(); |
|||
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); |
|||
std::vector<uint_fast64_t>::iterator middleIterator = tempGroups.begin(); |
|||
std::advance(middleIterator, lastSize); |
|||
std::inplace_merge(tempGroups.begin(), middleIterator, tempGroups.end()); |
|||
} |
|||
} else { |
|||
// Use std::sort
|
|||
for (size_t j = startIndex + 1; j < topologicalSort.size(); ++j) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; |
|||
tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); |
|||
} |
|||
std::sort(tempGroups.begin(), tempGroups.end()); |
|||
} |
|||
result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); |
|||
} |
|||
else { |
|||
// Only one group, copy construct.
|
|||
result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]])))); |
|||
} |
|||
++lastResultIndex; |
|||
} |
|||
#else
|
|||
for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { |
|||
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; |
|||
result.push_back(std::make_pair(false, scc)); |
|||
} |
|||
#endif
|
|||
return result; |
|||
} |
|||
|
|||
// Explicitly instantiate the solver.
|
|||
template class TopologicalValueIterationNondeterministicLinearEquationSolver<double>; |
|||
template class TopologicalValueIterationNondeterministicLinearEquationSolver<float>; |
|||
} // namespace solver
|
|||
} // namespace storm
|
@ -0,0 +1,97 @@ |
|||
#ifndef STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ |
|||
#define STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ |
|||
|
|||
#include "src/solver/NativeNondeterministicLinearEquationSolver.h" |
|||
#include "src/storage/StronglyConnectedComponentDecomposition.h" |
|||
#include "src/storage/SparseMatrix.h" |
|||
|
|||
#include <utility> |
|||
#include <vector> |
|||
|
|||
#include "storm-config.h" |
|||
#ifdef STORM_HAVE_CUDAFORSTORM |
|||
# include "cudaForStorm.h" |
|||
#endif |
|||
|
|||
namespace storm { |
|||
namespace solver { |
|||
|
|||
/*! |
|||
* A class that uses SCC decompositions to solve a nondeterministic linear equation system. |
|||
*/ |
|||
template<class ValueType> |
|||
class TopologicalValueIterationNondeterministicLinearEquationSolver : public NativeNondeterministicLinearEquationSolver<ValueType> { |
|||
public: |
|||
/*! |
|||
* Constructs a nondeterministic linear equation solver with parameters being set according to the settings |
|||
* object. |
|||
*/ |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver(); |
|||
|
|||
/*! |
|||
* Constructs a nondeterministic linear equation solver with the given parameters. |
|||
* |
|||
* @param precision The precision to use for convergence detection. |
|||
* @param maximalNumberOfIterations The maximal number of iterations to perform before the iteration is aborted. |
|||
* @param relative If set, the relative error rather than the absolute error is considered for convergence |
|||
* detection. |
|||
*/ |
|||
TopologicalValueIterationNondeterministicLinearEquationSolver(double precision, uint_fast64_t maximalNumberOfIterations, bool relative = true); |
|||
|
|||
virtual NondeterministicLinearEquationSolver<ValueType>* clone() const override; |
|||
|
|||
virtual void solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult = nullptr, std::vector<ValueType>* newX = nullptr) const override; |
|||
private: |
|||
/*! |
|||
* Given a topological sort of a SCC Decomposition, this will calculate the optimal grouping of SCCs with respect to the size of the GPU memory. |
|||
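* The result is a list of pairs, each consisting of a flag that indicates whether the block fits into the available CUDA memory and the corresponding set of states. |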
*/ |
|||
std::vector<std::pair<bool, storm::storage::StateBlock>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const; |
|||
}; |
|||
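// Usage sketch (illustrative only; names follow the declarations above): |
//   TopologicalValueIterationNondeterministicLinearEquationSolver<double> solver; |
//   solver.solveEquationSystem(true /* minimize */, A, x, b); |
// iterates over the SCCs of A in topological order and computes the fixpoint of x = min/max (A*x + b). |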
|
|||
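// The helpers below dispatch one value-iteration run to the CUDA plugin. Only the double and |
// float specializations are meaningful; the unspecialized templates exist solely to fail when |
// instantiated with any other value type. |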
template <typename ValueType> |
|||
bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
// Intentionally unimplemented; only the specializations below are valid. |
|||
throw; |
|||
} |
|||
template <> |
|||
inline bool __basicValueIteration_mvReduce_uint64_minimize<double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
#ifdef STORM_HAVE_CUDAFORSTORM |
|||
return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
#else |
|||
throw; |
|||
#endif |
|||
} |
|||
template <> |
|||
inline bool __basicValueIteration_mvReduce_uint64_minimize<float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
#ifdef STORM_HAVE_CUDAFORSTORM |
|||
return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
#else |
|||
throw; |
|||
#endif |
|||
} |
|||
|
|||
template <typename ValueType> |
|||
bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
// Intentionally unimplemented; only the specializations below are valid. |
|||
throw; |
|||
} |
|||
template <> |
|||
inline bool __basicValueIteration_mvReduce_uint64_maximize<double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
#ifdef STORM_HAVE_CUDAFORSTORM |
|||
return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
#else |
|||
throw; |
|||
#endif |
|||
} |
|||
template <> |
|||
inline bool __basicValueIteration_mvReduce_uint64_maximize<float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) { |
|||
#ifdef STORM_HAVE_CUDAFORSTORM |
|||
return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); |
|||
#else |
|||
throw; |
|||
#endif |
|||
} |
|||
} // namespace solver |
|||
} // namespace storm |
|||
|
|||
#endif /* STORM_SOLVER_TOPOLOGICALVALUEITERATIONNONDETERMINISTICLINEAREQUATIONSOLVER_H_ */ |
@ -0,0 +1,240 @@ |
|||
#include "gtest/gtest.h"
|
|||
#include "storm-config.h"
|
|||
|
|||
#include "src/solver/NativeNondeterministicLinearEquationSolver.h"
|
|||
#include "src/settings/Settings.h"
|
|||
#include "src/modelchecker/prctl/SparseMdpPrctlModelChecker.h"
|
|||
#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h"
|
|||
#include "src/parser/AutoParser.h"
|
|||
|
|||
#include "storm-config.h"
|
|||
|
|||
TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Dice) { |
|||
storm::settings::Settings* s = storm::settings::Settings::getInstance(); |
|||
std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew")->as<storm::models::Mdp<double>>(); |
|||
|
|||
ASSERT_EQ(mdp->getNumberOfStates(), 169ull); |
|||
ASSERT_EQ(mdp->getNumberOfTransitions(), 436ull); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp); |
|||
|
|||
storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("two"); |
|||
storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
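// The trailing boolean of ProbabilisticNoBoundOperator selects the optimization direction
// (true appears to request the minimizing variant, false the maximizing one), so every
// property below is checked in both directions.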
|
|||
std::vector<double> result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0277777612209320068), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("two"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0277777612209320068), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("three"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0555555224418640136), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("three"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0555555224418640136), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("four"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.083333283662796020508), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("four"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.083333283662796020508), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 7.33332904), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
std::shared_ptr<storm::models::Mdp<double>> stateRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", "")->as<storm::models::Mdp<double>>(); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> stateRewardModelChecker(*stateRewardMdp); |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 7.33332904), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = stateRewardModelChecker.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 7.333329499), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 7.33333151), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
std::shared_ptr<storm::models::Mdp<double>> stateAndTransitionRewardMdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.tra", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.lab", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.state.rew", STORM_CPP_BASE_PATH "/examples/mdp/two_dice/two_dice.flip.trans.rew")->as<storm::models::Mdp<double>>(); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> stateAndTransitionRewardModelChecker(*stateAndTransitionRewardMdp); |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 14.6666581), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("done"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = stateAndTransitionRewardModelChecker.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 14.666658998), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 14.666663), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
} |
|||
|
|||
TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { |
|||
storm::settings::Settings* s = storm::settings::Settings::getInstance(); |
|||
std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader4.trans.rew")->as<storm::models::Mdp<double>>(); |
|||
|
|||
ASSERT_EQ(mdp->getNumberOfStates(), 3172ull); |
|||
ASSERT_EQ(mdp->getNumberOfTransitions(), 7144ull); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp); |
|||
|
|||
storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
std::vector<double> result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 1), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 1), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::BoundedEventually<double>* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0625), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0625), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
|
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 4.285701547), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
ASSERT_LT(std::abs(result[0] - 4.285689611), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#else
|
|||
ASSERT_LT(std::abs(result[0] - 4.285703591), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
#endif
|
|||
delete rewardFormula; |
|||
} |
@ -0,0 +1,354 @@ |
|||
#include "gtest/gtest.h"
|
|||
#include "src/storage/SparseMatrix.h"
|
|||
#include "src/exceptions/InvalidStateException.h"
|
|||
#include "src/exceptions/OutOfRangeException.h"
|
|||
|
|||
#include "storm-config.h"
|
|||
|
|||
#ifdef STORM_HAVE_CUDAFORSTORM
|
|||
|
|||
#include "cudaForStorm.h"
|
|||
|
|||
TEST(CudaPlugin, SpMV_4x4) { |
|||
storm::storage::SparseMatrixBuilder<double> matrixBuilder(4, 4, 10); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 8.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 2, -5.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 3, 2.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 0, 2.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 2.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, 4.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 4.0)); |
|||
|
|||
|
|||
storm::storage::SparseMatrix<double> matrix; |
|||
ASSERT_NO_THROW(matrix = matrixBuilder.build()); |
|||
|
|||
ASSERT_EQ(4, matrix.getRowCount()); |
|||
ASSERT_EQ(4, matrix.getColumnCount()); |
|||
ASSERT_EQ(10, matrix.getEntryCount()); |
|||
|
|||
std::vector<double> x({0, 4, 1, 1}); |
|||
std::vector<double> b({0, 0, 0, 0}); |
|||
|
|||
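// Hand-computed expectation for b = A * x with x = (0, 4, 1, 1):
//   b[0] = 1*4 + (-1)*1 = 3
//   b[1] = 8*0 + 7*4 + (-5)*1 + 2*1 = 25
//   b[2] = 2*0 + 2*4 + 4*1 + 4*1 = 16
//   b[3] = 0 (row 3 has no entries)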
ASSERT_NO_THROW(basicValueIteration_spmv_uint64_double(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); |
|||
|
|||
ASSERT_EQ(b.at(0), 3); |
|||
ASSERT_EQ(b.at(1), 25); |
|||
ASSERT_EQ(b.at(2), 16); |
|||
ASSERT_EQ(b.at(3), 0); |
|||
} |
|||
|
|||
TEST(CudaPlugin, SpMV_4x4_float) { |
|||
storm::storage::SparseMatrixBuilder<float> matrixBuilder(4, 4, 10); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 8.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 2, -5.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 3, 2.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 0, 2.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 2.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, 4.0f)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 4.0f)); |
|||
|
|||
|
|||
storm::storage::SparseMatrix<float> matrix; |
|||
ASSERT_NO_THROW(matrix = matrixBuilder.build()); |
|||
|
|||
ASSERT_EQ(4, matrix.getRowCount()); |
|||
ASSERT_EQ(4, matrix.getColumnCount()); |
|||
ASSERT_EQ(10, matrix.getEntryCount()); |
|||
|
|||
std::vector<float> x({ 0.f, 4.f, 1.f, 1.f }); |
|||
std::vector<float> b({ 0.f, 0.f, 0.f, 0.f }); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); |
|||
|
|||
ASSERT_EQ(b.at(0), 3); |
|||
ASSERT_EQ(b.at(1), 25); |
|||
ASSERT_EQ(b.at(2), 16); |
|||
ASSERT_EQ(b.at(3), 0); |
|||
} |
|||
|
|||
TEST(CudaPlugin, SpMV_VerySmall) { |
|||
storm::storage::SparseMatrixBuilder<double> matrixBuilder(2, 2, 2); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0)); |
|||
|
|||
storm::storage::SparseMatrix<double> matrix; |
|||
ASSERT_NO_THROW(matrix = matrixBuilder.build()); |
|||
|
|||
ASSERT_EQ(2, matrix.getRowCount()); |
|||
ASSERT_EQ(2, matrix.getColumnCount()); |
|||
ASSERT_EQ(2, matrix.getEntryCount()); |
|||
|
|||
std::vector<double> x({ 4.0, 8.0 }); |
|||
std::vector<double> b({ 0.0, 0.0 }); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_spmv_uint64_double(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); |
|||
|
|||
ASSERT_EQ(b.at(0), 4.0); |
|||
ASSERT_EQ(b.at(1), 16.0); |
|||
} |
|||
|
|||
TEST(CudaPlugin, SpMV_VerySmall_float) { |
|||
storm::storage::SparseMatrixBuilder<float> matrixBuilder(2, 2, 2); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); |
|||
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0)); |
|||
|
|||
storm::storage::SparseMatrix<float> matrix; |
|||
ASSERT_NO_THROW(matrix = matrixBuilder.build()); |
|||
|
|||
ASSERT_EQ(2, matrix.getRowCount()); |
|||
ASSERT_EQ(2, matrix.getColumnCount()); |
|||
ASSERT_EQ(2, matrix.getEntryCount()); |
|||
|
|||
std::vector<float> x({ 4.0, 8.0 }); |
|||
std::vector<float> b({ 0.0, 0.0 }); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); |
|||
|
|||
ASSERT_EQ(b.at(0), 4.0); |
|||
ASSERT_EQ(b.at(1), 16.0); |
|||
} |
|||
|
|||
TEST(CudaPlugin, AddVectorsInplace) { |
|||
std::vector<double> vectorA_1 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; |
|||
std::vector<double> vectorA_2 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; |
|||
std::vector<double> vectorA_3 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; |
|||
std::vector<double> vectorB = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; |
|||
std::vector<double> vectorC = { -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0 }; |
|||
|
|||
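// basicValueIteration_addVectorsInplace_double adds the second vector element-wise into the
// first (a[i] += b[i]); the loop at the end recomputes the sums on the CPU for comparison.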
ASSERT_EQ(vectorA_1.size(), 8); |
|||
ASSERT_EQ(vectorA_2.size(), 8); |
|||
ASSERT_EQ(vectorA_3.size(), 8); |
|||
ASSERT_EQ(vectorB.size(), 8); |
|||
ASSERT_EQ(vectorC.size(), 8); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_1, vectorB)); |
|||
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_2, vectorC)); |
|||
|
|||
ASSERT_EQ(vectorA_1.size(), 8); |
|||
ASSERT_EQ(vectorA_2.size(), 8); |
|||
ASSERT_EQ(vectorA_3.size(), 8); |
|||
ASSERT_EQ(vectorB.size(), 8); |
|||
ASSERT_EQ(vectorC.size(), 8); |
|||
|
|||
for (size_t i = 0; i < vectorA_3.size(); ++i) { |
|||
double cpu_result_b = vectorA_3.at(i) + vectorB.at(i); |
|||
double cpu_result_c = vectorA_3.at(i) + vectorC.at(i); |
|||
|
|||
ASSERT_EQ(cpu_result_b, vectorA_1.at(i)); |
|||
ASSERT_EQ(cpu_result_c, vectorA_2.at(i)); |
|||
} |
|||
} |
|||
|
|||
TEST(CudaPlugin, AddVectorsInplace_float) { |
|||
std::vector<float> vectorA_1 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; |
|||
std::vector<float> vectorA_2 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; |
|||
std::vector<float> vectorA_3 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; |
|||
std::vector<float> vectorB = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; |
|||
std::vector<float> vectorC = { -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f }; |
|||
|
|||
ASSERT_EQ(vectorA_1.size(), 8); |
|||
ASSERT_EQ(vectorA_2.size(), 8); |
|||
ASSERT_EQ(vectorA_3.size(), 8); |
|||
ASSERT_EQ(vectorB.size(), 8); |
|||
ASSERT_EQ(vectorC.size(), 8); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_1, vectorB)); |
|||
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_2, vectorC)); |
|||
|
|||
ASSERT_EQ(vectorA_1.size(), 8); |
|||
ASSERT_EQ(vectorA_2.size(), 8); |
|||
ASSERT_EQ(vectorA_3.size(), 8); |
|||
ASSERT_EQ(vectorB.size(), 8); |
|||
ASSERT_EQ(vectorC.size(), 8); |
|||
|
|||
for (size_t i = 0; i < vectorA_3.size(); ++i) { |
|||
float cpu_result_b = vectorA_3.at(i) + vectorB.at(i); |
|||
float cpu_result_c = vectorA_3.at(i) + vectorC.at(i); |
|||
|
|||
ASSERT_EQ(cpu_result_b, vectorA_1.at(i)); |
|||
ASSERT_EQ(cpu_result_c, vectorA_2.at(i)); |
|||
} |
|||
} |
|||
|
|||
TEST(CudaPlugin, ReduceGroupedVector) { |
|||
std::vector<double> groupedVector = { |
|||
0.0, -1000.0, 0.000004, // Group 0
|
|||
5.0, // Group 1
|
|||
0.0, 1.0, 2.0, 3.0, // Group 2
|
|||
-1000.0, -3.14, -0.0002,// Group 3 (neg only)
|
|||
25.25, 25.25, 25.25, // Group 4
|
|||
0.0, 0.0, 1.0, // Group 5
|
|||
-0.000001, 0.000001 // Group 6
|
|||
}; |
|||
std::vector<uint_fast64_t> grouping = { |
|||
0, 3, 4, 8, 11, 14, 17, 19 |
|||
}; |
|||
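// Group g covers the entries groupedVector[grouping[g]] .. groupedVector[grouping[g + 1] - 1];
// the reduce kernels keep the minimum (resp. maximum) of each group, mirroring the resolution
// of the nondeterministic choices of one row group during value iteration.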
|
|||
std::vector<double> result_minimize = { |
|||
-1000.0, // Group 0
|
|||
5.0, |
|||
0.0, |
|||
-1000.0, |
|||
25.25, |
|||
0.0, |
|||
-0.000001 |
|||
}; |
|||
std::vector<double> result_maximize = { |
|||
0.000004, |
|||
5.0, |
|||
3.0, |
|||
-0.0002, |
|||
25.25, |
|||
1.0, |
|||
0.000001 |
|||
}; |
|||
|
|||
std::vector<double> result_cuda_minimize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; |
|||
std::vector<double> result_cuda_maximize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_minimize(groupedVector, grouping, result_cuda_minimize)); |
|||
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_maximize(groupedVector, grouping, result_cuda_maximize)); |
|||
|
|||
for (size_t i = 0; i < result_minimize.size(); ++i) { |
|||
ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i)); |
|||
ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i)); |
|||
} |
|||
} |
|||
|
|||
TEST(CudaPlugin, ReduceGroupedVector_float) { |
|||
std::vector<float> groupedVector = { |
|||
0.0f, -1000.0f, 0.000004f, // Group 0
|
|||
5.0f, // Group 1
|
|||
0.0f, 1.0f, 2.0f, 3.0f, // Group 2
|
|||
-1000.0f, -3.14f, -0.0002f,// Group 3 (neg only)
|
|||
25.25f, 25.25f, 25.25f, // Group 4
|
|||
0.0f, 0.0f, 1.0f, // Group 5
|
|||
-0.000001f, 0.000001f // Group 6
|
|||
}; |
|||
std::vector<uint_fast64_t> grouping = { |
|||
0, 3, 4, 8, 11, 14, 17, 19 |
|||
}; |
|||
|
|||
std::vector<float> result_minimize = { |
|||
-1000.0f, // Group 0
|
|||
5.0f, |
|||
0.0f, |
|||
-1000.0f, |
|||
25.25f, |
|||
0.0f, |
|||
-0.000001f |
|||
}; |
|||
std::vector<float> result_maximize = { |
|||
0.000004f, |
|||
5.0f, |
|||
3.0f, |
|||
-0.0002f, |
|||
25.25f, |
|||
1.0f, |
|||
0.000001f |
|||
}; |
|||
|
|||
std::vector<float> result_cuda_minimize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; |
|||
std::vector<float> result_cuda_maximize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_minimize(groupedVector, grouping, result_cuda_minimize)); |
|||
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_maximize(groupedVector, grouping, result_cuda_maximize)); |
|||
|
|||
for (size_t i = 0; i < result_minimize.size(); ++i) { |
|||
ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i)); |
|||
ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i)); |
|||
} |
|||
} |
|||
|
|||
TEST(CudaPlugin, equalModuloPrecision) { |
|||
std::vector<double> x = { |
|||
123.45, 67.8, 901.23, 456789.012, 3.456789, -4567890.12 |
|||
}; |
|||
std::vector<double> y1 = { |
|||
0.45, 0.8, 0.23, 0.012, 0.456789, -0.12 |
|||
}; |
|||
std::vector<double> y2 = { |
|||
0.45, 0.8, 0.23, 456789.012, 0.456789, -4567890.12 |
|||
}; |
|||
std::vector<double> x2; |
|||
std::vector<double> x3; |
|||
std::vector<double> y3; |
|||
std::vector<double> y4; |
|||
x2.reserve(1000); |
|||
x3.reserve(1000); |
|||
y3.reserve(1000); |
|||
y4.reserve(1000); |
|||
for (size_t i = 0; i < 1000; ++i) { |
|||
x2.push_back(static_cast<double>(i)); |
|||
y3.push_back(1.0); |
|||
x3.push_back(-(1000.0 - static_cast<double>(i))); |
|||
y4.push_back(1.0); |
|||
} |
|||
|
|||
double maxElement1 = 0.0; |
|||
double maxElement2 = 0.0; |
|||
double maxElement3 = 0.0; |
|||
double maxElement4 = 0.0; |
|||
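// maxElement is expected to receive the largest (absolute resp. relative) difference between
// the two vectors, e.g. for x vs. y1 above: |(-4567890.12) - (-0.12)| = 4567890.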
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y1, maxElement1)); |
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y2, maxElement2)); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x2, y3, maxElement3)); |
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x3, y4, maxElement4)); |
|||
|
|||
ASSERT_DOUBLE_EQ(4567890.0, maxElement1); |
|||
ASSERT_DOUBLE_EQ(901.0, maxElement2); |
|||
|
|||
ASSERT_DOUBLE_EQ(998.0, maxElement3); |
|||
ASSERT_DOUBLE_EQ(1001.0, maxElement4); |
|||
} |
|||
|
|||
TEST(CudaPlugin, equalModuloPrecision_float) { |
|||
std::vector<float> x = { |
|||
123.45f, 67.8f, 901.23f, 456789.012f, 3.456789f, -4567890.12f |
|||
}; |
|||
std::vector<float> y1 = { |
|||
0.45f, 0.8f, 0.23f, 0.012f, 0.456789f, -0.12f |
|||
}; |
|||
std::vector<float> y2 = { |
|||
0.45f, 0.8f, 0.23f, 456789.012f, 0.456789f, -4567890.12f |
|||
}; |
|||
std::vector<float> x2; |
|||
std::vector<float> x3; |
|||
std::vector<float> y3; |
|||
std::vector<float> y4; |
|||
x2.reserve(1000); |
|||
x3.reserve(1000); |
|||
y3.reserve(1000); |
|||
y4.reserve(1000); |
|||
for (size_t i = 0; i < 1000; ++i) { |
|||
x2.push_back(static_cast<float>(i)); |
|||
y3.push_back(1.0f); |
|||
x3.push_back(-(1000.0f - static_cast<float>(i))); |
|||
y4.push_back(1.0f); |
|||
} |
|||
|
|||
float maxElement1 = 0.0f; |
|||
float maxElement2 = 0.0f; |
|||
float maxElement3 = 0.0f; |
|||
float maxElement4 = 0.0f; |
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y1, maxElement1)); |
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y2, maxElement2)); |
|||
|
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x2, y3, maxElement3)); |
|||
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x3, y4, maxElement4)); |
|||
|
|||
ASSERT_FLOAT_EQ(4567890.0f, maxElement1); |
|||
ASSERT_FLOAT_EQ(901.0f, maxElement2); |
|||
|
|||
ASSERT_FLOAT_EQ(998.0f, maxElement3); |
|||
ASSERT_FLOAT_EQ(1001.0f, maxElement4); |
|||
} |
|||
|
|||
#endif
|
@ -0,0 +1,163 @@ |
|||
#include "gtest/gtest.h"
|
|||
#include "storm-config.h"
|
|||
|
|||
#include "src/settings/Settings.h"
|
|||
#include "src/modelchecker/prctl/TopologicalValueIterationMdpPrctlModelChecker.h"
|
|||
#include "src/solver/NativeNondeterministicLinearEquationSolver.h"
|
|||
#include "src/parser/AutoParser.h"
|
|||
|
|||
TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, AsynchronousLeader) { |
|||
storm::settings::Settings* s = storm::settings::Settings::getInstance(); |
|||
std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.tra", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.lab", "", STORM_CPP_BASE_PATH "/examples/mdp/asynchronous_leader/leader7.trans.rew")->as<storm::models::Mdp<double>>(); |
|||
|
|||
ASSERT_EQ(mdp->getNumberOfStates(), 2095783ull); |
|||
ASSERT_EQ(mdp->getNumberOfTransitions(), 7714385ull); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp); |
|||
|
|||
storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
std::vector<double> result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::BoundedEventually<double>* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 25); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 6.172433512), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("elected"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[0] - 6.1724344), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete rewardFormula; |
|||
} |
|||
|
|||
TEST(TopologicalValueIterationMdpPrctlModelCheckerTest, Consensus) { |
|||
storm::settings::Settings* s = storm::settings::Settings::getInstance(); |
|||
// Increase the maximal number of iterations, because the solver does not converge otherwise.
|
|||
// This is done in the main cpp unit of the test suite.
|
|||
|
|||
std::shared_ptr<storm::models::Mdp<double>> mdp = storm::parser::AutoParser::parseModel(STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.tra", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.lab", STORM_CPP_BASE_PATH "/examples/mdp/consensus/coin4_6.steps.state.rew", "")->as<storm::models::Mdp<double>>(); |
|||
|
|||
ASSERT_EQ(mdp->getNumberOfStates(), 63616ull); |
|||
ASSERT_EQ(mdp->getNumberOfTransitions(), 213472ull); |
|||
|
|||
storm::modelchecker::prctl::TopologicalValueIterationMdpPrctlModelChecker<double> mc(*mdp); |
|||
|
|||
storm::property::prctl::Ap<double>* apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
storm::property::prctl::Eventually<double>* eventuallyFormula = new storm::property::prctl::Eventually<double>(apFormula); |
|||
storm::property::prctl::ProbabilisticNoBoundOperator<double>* probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
std::vector<double> result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 1.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
storm::property::prctl::Ap<double>* apFormula2 = new storm::property::prctl::Ap<double>("all_coins_equal_0"); |
|||
storm::property::prctl::And<double>* andFormula = new storm::property::prctl::And<double>(apFormula, apFormula2); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(andFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 0.4374282832), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
apFormula2 = new storm::property::prctl::Ap<double>("all_coins_equal_1"); |
|||
andFormula = new storm::property::prctl::And<double>(apFormula, apFormula2); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(andFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 0.5293286369), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
apFormula2 = new storm::property::prctl::Ap<double>("agree"); |
|||
storm::property::prctl::Not<double>* notFormula = new storm::property::prctl::Not<double>(apFormula2); |
|||
andFormula = new storm::property::prctl::And<double>(apFormula, notFormula); |
|||
eventuallyFormula = new storm::property::prctl::Eventually<double>(andFormula); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(eventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 0.10414097), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
storm::property::prctl::BoundedEventually<double>* boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 50ull); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
boundedEventuallyFormula = new storm::property::prctl::BoundedEventually<double>(apFormula, 50ull); |
|||
probFormula = new storm::property::prctl::ProbabilisticNoBoundOperator<double>(boundedEventuallyFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*probFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 0.0), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete probFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
storm::property::prctl::ReachabilityReward<double>* reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
storm::property::prctl::RewardNoBoundOperator<double>* rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, true); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 1725.593313), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete rewardFormula; |
|||
|
|||
apFormula = new storm::property::prctl::Ap<double>("finished"); |
|||
reachabilityRewardFormula = new storm::property::prctl::ReachabilityReward<double>(apFormula); |
|||
rewardFormula = new storm::property::prctl::RewardNoBoundOperator<double>(reachabilityRewardFormula, false); |
|||
|
|||
result = mc.checkNoBoundOperator(*rewardFormula); |
|||
|
|||
ASSERT_LT(std::abs(result[31168] - 2183.142422), s->getOptionByLongName("precision").getArgument(0).getValueAsDouble()); |
|||
delete rewardFormula; |
|||
} |