Browse Source

Fixed include directories for CUDA Plugin in CMakeLists.txt

Refactored all code related to the SPMV kernels to work with float.
Wrote a test that determines whether the compiler uses 64bit boundary alignments on std::pairs of uint64 and float.
Introduced functions that allow for conversions between different ValueTypes (e.g. from float to double and backwards).


Former-commit-id: 830d24064f
tempestpy_adaptions
PBerger 10 years ago
parent
commit
ea427fcde1
  1. 2
      CMakeLists.txt
  2. 31
      resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
  3. 18
      resources/cudaForStorm/CMakeLists.txt
  4. 143
      resources/cudaForStorm/srcCuda/basicValueIteration.cu
  5. 12
      resources/cudaForStorm/srcCuda/basicValueIteration.h
  6. 353
      resources/cudaForStorm/srcCuda/cuspExtension.h
  7. 361
      resources/cudaForStorm/srcCuda/cuspExtensionDouble.h
  8. 375
      resources/cudaForStorm/srcCuda/cuspExtensionFloat.h
  9. 4
      resources/cudaForStorm/storm-cudaplugin-config.h.in
  10. 3
      src/models/PseudoModel.cpp
  11. 3
      src/solver/NativeNondeterministicLinearEquationSolver.cpp
  12. 198
      src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
  13. 4
      src/storage/SparseMatrix.cpp
  14. 17
      src/storage/SparseMatrix.h
  15. 1
      src/storage/StronglyConnectedComponentDecomposition.cpp
  16. 14
      src/utility/vector.h
  17. 193
      test/functional/solver/CudaPluginTest.cpp

2
CMakeLists.txt

@ -320,7 +320,7 @@ if (ENABLE_Z3)
link_directories("${Z3_ROOT}/bin")
endif()
if (STORM_USE_CUDAFORSTORM)
link_directories("${PROJECT_SOURCE_DIR}/build/cudaForStorm/lib")
link_directories("${PROJECT_BINARY_DIR}/cudaForStorm/lib")
endif()
if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL ""))
set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib")

31
resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp

@ -0,0 +1,31 @@
/*
* This is component of StoRM - Cuda Plugin to check whether a pair of uint_fast64_t and float gets auto-aligned to match 64bit boundaries
*/
#include <cstdint>
#include <utility>
#include <vector>
#define CONTAINER_SIZE 100ul
int main(int argc, char* argv[]) {
int result = 0;
std::vector<std::pair<uint_fast64_t, float>> myVector;
for (size_t i = 0; i < CONTAINER_SIZE; ++i) {
myVector.push_back(std::make_pair(i, 42.12345f * i));
}
char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first));
char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first));
ptrdiff_t uintDiff = secondUintPointer - firstUintPointer;
if (uintDiff == (2 * sizeof(uint_fast64_t))) {
result = 2;
} else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) {
result = 3;
} else {
result = -5;
}
return result;
}

18
resources/cudaForStorm/CMakeLists.txt

@ -131,6 +131,24 @@ else()
message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})")
endif()
# Test for Float 64bit Alignment
try_run(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT
${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp"
COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR
)
if(NOT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT)
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test float type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}")
elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 2)
message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment active.")
set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "define")
elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 3)
message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment disabled.")
set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "undef")
else()
message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Float Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_FLOATALIGNMENT})")
endif()
#
# Make a version file containing the current version from git.
#

143
resources/cudaForStorm/srcCuda/basicValueIteration.cu

@ -10,20 +10,15 @@
#include "utility.h"
#include "cuspExtension.h"
#include <thrust/transform.h>
#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include "storm-cudaplugin-config.h"
#ifdef DEBUG
#define CUDA_CHECK_ALL_ERRORS() do { \
cudaError_t errSync = cudaGetLastError(); \
cudaError_t errAsync = cudaDeviceSynchronize(); \
if (errSync != cudaSuccess) { \
std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ")" << std::endl; \
} \
if (errAsync != cudaSuccess) { \
std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \
} } while(false)
#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false)
#else
#define CUDA_CHECK_ALL_ERRORS() do {} while (false)
#endif
@ -56,15 +51,16 @@ void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVect
}
}
// TEMPLATE VERSION
template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) {
bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) {
//std::vector<IndexType> matrixColumnIndices;
//std::vector<ValueType> matrixValues;
//exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
bool errorOccured = false;
IndexType* device_matrixRowIndices = nullptr;
IndexType* device_matrixColIndicesAndValues = nullptr;
ValueType* device_matrixColIndicesAndValues = nullptr;
ValueType* device_x = nullptr;
ValueType* device_xSwap = nullptr;
ValueType* device_b = nullptr;
@ -74,7 +70,7 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
#ifdef DEBUG
std::cout.sync_with_stdio(true);
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
#endif
@ -96,6 +92,20 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
goto cleanup;
}
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true
#else
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false
#endif
if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
errorOccured = true;
goto cleanup;
}
} else {
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
if (cudaMallocResult != cudaSuccess) {
@ -103,6 +113,7 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
errorOccured = true;
goto cleanup;
}
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
@ -159,6 +170,16 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
goto cleanup;
}
// Copy all data as floats are expanded to 64bits :/
if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
errorOccured = true;
goto cleanup;
}
} else {
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
@ -166,6 +187,7 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
errorOccured = true;
goto cleanup;
}
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
@ -214,9 +236,8 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
#endif
// Data is on device, start Kernel
while (!converged && iterationCount < maxIterationCount)
{ // In a sub-area since transfer of control via label evades initialization
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS();
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
@ -227,7 +248,7 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
CUDA_CHECK_ALL_ERRORS();
// Reduce: Reduce multiplyResult to a new x vector
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, IndexType, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS();
// Check for convergence
@ -338,7 +359,7 @@ cleanup:
template <typename IndexType, typename ValueType>
void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) {
IndexType* device_matrixRowIndices = nullptr;
IndexType* device_matrixColIndicesAndValues = nullptr;
ValueType* device_matrixColIndicesAndValues = nullptr;
ValueType* device_x = nullptr;
ValueType* device_multiplyResult = nullptr;
@ -362,12 +383,21 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
goto cleanup;
}
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
#else
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
#endif
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
@ -397,12 +427,21 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
goto cleanup;
}
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
#else
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
#endif
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
@ -423,7 +462,7 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
#endif
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS();
#ifdef DEBUG
@ -601,7 +640,7 @@ void basicValueIteration_reduceGroupedVector(std::vector<ValueType> const& group
do {
// Reduce: Reduce multiplyResult to a new x vector
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, IndexType, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector);
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector);
CUDA_CHECK_ALL_ERRORS();
} while (false);
@ -713,7 +752,7 @@ cleanup:
*/
void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) {
basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
basicValueIteration_spmv<uint_fast64_t, double>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
}
void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) {
@ -736,6 +775,31 @@ void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<dou
basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement);
}
// Float
void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b) {
basicValueIteration_spmv<uint_fast64_t, float>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
}
void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b) {
basicValueIteration_addVectorsInplace<float>(a, b);
}
void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
basicValueIteration_reduceGroupedVector<uint_fast64_t, float, true>(groupedVector, grouping, targetVector);
}
void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
basicValueIteration_reduceGroupedVector<uint_fast64_t, float, false>(groupedVector, grouping, targetVector);
}
void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
basicValueIteration_equalModuloPrecision<float, true>(x, y, maxElement);
}
void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
basicValueIteration_equalModuloPrecision<float, false>(x, y, maxElement);
}
bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
if (relativePrecisionCheck) {
return basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
@ -752,6 +816,22 @@ bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const max
}
}
bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
if (relativePrecisionCheck) {
return basicValueIteration_mvReduce<true, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
} else {
return basicValueIteration_mvReduce<true, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
}
}
bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
if (relativePrecisionCheck) {
return basicValueIteration_mvReduce<false, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
} else {
return basicValueIteration_mvReduce<false, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
}
}
size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
size_t const valueTypeSize = sizeof(double);
size_t const indexTypeSize = sizeof(uint_fast64_t);
@ -774,3 +854,26 @@ size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t con
return (matrixDataSize + vectorSizes);
}
size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
size_t const valueTypeSize = sizeof(float);
size_t const indexTypeSize = sizeof(uint_fast64_t);
/*
IndexType* device_matrixRowIndices = nullptr;
IndexType* device_matrixColIndices = nullptr;
ValueType* device_matrixValues = nullptr;
ValueType* device_x = nullptr;
ValueType* device_xSwap = nullptr;
ValueType* device_b = nullptr;
ValueType* device_multiplyResult = nullptr;
IndexType* device_nondeterministicChoiceIndices = nullptr;
*/
// Row Indices, Column Indices, Values, Choice Indices
size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
// Vectors x, xSwap, b, multiplyResult
size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);
return (matrixDataSize + vectorSizes);
}

12
resources/cudaForStorm/srcCuda/basicValueIteration.h

@ -85,6 +85,11 @@ template<typename T>
cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount);
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount);
cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b);
cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b);
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector);
@ -92,4 +97,11 @@ cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_m
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement);
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement);
cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b);
cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b);
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector);
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector);
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement);
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement);
#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_

353
resources/cudaForStorm/srcCuda/cuspExtension.h

@ -1,338 +1,47 @@
/*
* This is an extension of the original CUSP csr_vector.h SPMV implementation.
* It is based on the Code and incorporates changes as to cope with the details
* of the StoRM code.
* As this is mostly copy & paste, the original license still applies.
*/
/*
* Copyright 2008-2009 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cusp/detail/device/spmv/csr_vector.h>
#include <limits>
#include <algorithm>
namespace cusp
{
namespace detail
{
namespace device
{
//////////////////////////////////////////////////////////////////////////////
// CSR SpMV kernels based on a vector model (one warp per row)
//////////////////////////////////////////////////////////////////////////////
//
// spmv_csr_vector_device
// Each row of the CSR matrix is assigned to a warp. The warp computes
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with
// the x vector, in parallel. This division of work implies that
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
// manner (but generally not aligned). On GT200 these accesses are
// coalesced, unlike kernels based on the one-row-per-thread division of
// work. Since an entire 32-thread warp is assigned to each row, many
// threads will remain idle when their row contains a small number
// of elements. This code relies on implicit synchronization among
// threads in a warp.
//
// spmv_csr_vector_tex_device
// Same as spmv_csr_vector_tex_device, except that the texture cache is
// used for accessing the x vector.
//
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32]
template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y)
{
__shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals
__shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];
const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector
const IndexType vector_id = thread_id / THREADS_PER_VECTOR; // global vector index
const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block
const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors
for(IndexType row = vector_id; row < num_rows; row += num_vectors)
{
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the straightforward version
if(thread_lane < 2)
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane];
const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
// initialize local sum
ValueType sum = 0;
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
{
// ensure aligned memory access to Aj and Ax
IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
// accumulate local sums
if(jj >= row_start && jj < row_end)
sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
// accumulate local sums
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
}
else
{
// accumulate local sums
for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
}
// store local sum in shared memory
sdata[threadIdx.x] = sum;
// reduce local sums to row sum
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8];
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4];
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2];
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1];
// first thread writes the result
if (thread_lane == 0)
y[row] = sdata[threadIdx.x];
}
}
template <typename IndexType, typename ValueType, unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1)
__global__ void
storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y, const ValueType minMaxInitializer)
{
__shared__ volatile ValueType sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals
__shared__ volatile IndexType ptrs[ROWS_PER_BLOCK][2];
const IndexType THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW;
const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
const IndexType thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector
const IndexType vector_id = thread_id / THREADS_PER_ROW; // global vector index
const IndexType vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block
const IndexType num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors
for(IndexType row = vector_id; row < num_rows; row += num_vectors)
{
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the straightforward version
if(thread_lane < 2)
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane];
const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
// initialize local Min/Max
ValueType localMinMaxElement = minMaxInitializer;
if (THREADS_PER_ROW == 32 && row_end - row_start > 32)
{
// ensure aligned memory access to Aj and Ax
IndexType jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane;
// accumulate local sums
if(jj >= row_start && jj < row_end) {
if(Minimize) {
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
// accumulate local sums
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW)
if(Minimize) {
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
else
{
// accumulate local sums
for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW)
if(Minimize) {
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
#include "cuspExtensionFloat.h"
#include "cuspExtensionDouble.h"
// store local sum in shared memory
sdata[threadIdx.x] = localMinMaxElement;
namespace cusp {
namespace detail {
namespace device {
// reduce local min/max to row min/max
if (Minimize) {
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);
} else {
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);
}
// first thread writes the result
if (thread_lane == 0)
x[row] = sdata[threadIdx.x];
}
template <typename ValueType>
void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) {
//
throw;
}
template <bool Minimize, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y)
{
ValueType __minMaxInitializer = -std::numeric_limits<ValueType>::max();
if (Minimize) {
__minMaxInitializer = std::numeric_limits<ValueType>::max();
}
const ValueType minMaxInitializer = __minMaxInitializer;
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
storm_cuda_opt_vector_reduce_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer);
template <>
void storm_cuda_opt_spmv_csr_vector<double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) {
storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
template <bool Minimize, typename IndexType, typename ValueType>
void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_entries, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y)
{
const IndexType rows_per_group = num_entries / num_rows;
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; }
__storm_cuda_opt_vector_reduce<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y);
template <>
void storm_cuda_opt_spmv_csr_vector<float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) {
storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
{
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
if (UseCache)
bind_x(x);
storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
if (UseCache)
unbind_x(x);
}
template <typename IndexType, typename ValueType>
void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<false, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<false, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<false, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<false,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector<false,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
template <bool Minimize, typename ValueType>
void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) {
//
throw;
}
template <typename IndexType, typename ValueType>
void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<true, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<true, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<true, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<true,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector<true,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
template <>
void storm_cuda_opt_vector_reduce<true, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
storm_cuda_opt_vector_reduce_double<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}
// NON-OPT
template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
if (UseCache)
bind_x(x);
spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
if (UseCache)
unbind_x(x);
template <>
void storm_cuda_opt_vector_reduce<false, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
storm_cuda_opt_vector_reduce_double<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}
template <typename IndexType, typename ValueType>
void storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
__storm_cuda_spmv_csr_vector<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
template <>
void storm_cuda_opt_vector_reduce<true, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
storm_cuda_opt_vector_reduce_float<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}
template <typename IndexType, typename ValueType>
void storm_cuda_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
__storm_cuda_spmv_csr_vector<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
template <>
void storm_cuda_opt_vector_reduce<false, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
storm_cuda_opt_vector_reduce_float<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}
} // end namespace device

361
resources/cudaForStorm/srcCuda/cuspExtensionDouble.h

@ -0,0 +1,361 @@
/*
* This is an extension of the original CUSP csr_vector.h SPMV implementation.
* It is based on the Code and incorporates changes as to cope with the details
* of the StoRM code.
* As this is mostly copy & paste, the original license still applies.
*/
/*
* Copyright 2008-2009 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <limits>
#include <cstdint>
#include <algorithm>
#include <math_functions.h>
#include <cusp/detail/device/spmv/csr_vector.h>
namespace cusp
{
namespace detail
{
namespace device
{
//////////////////////////////////////////////////////////////////////////////
// CSR SpMV kernels based on a vector model (one warp per row)
//////////////////////////////////////////////////////////////////////////////
//
// spmv_csr_vector_device
// Each row of the CSR matrix is assigned to a warp. The warp computes
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with
// the x vector, in parallel. This division of work implies that
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
// manner (but generally not aligned). On GT200 these accesses are
// coalesced, unlike kernels based on the one-row-per-thread division of
// work. Since an entire 32-thread warp is assigned to each row, many
// threads will remain idle when their row contains a small number
// of elements. This code relies on implicit synchronization among
// threads in a warp.
//
// spmv_csr_vector_tex_device
// Same as spmv_csr_vector_tex_device, except that the texture cache is
// used for accessing the x vector.
//
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32]
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double * x, double * y)
{
__shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2];
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
{
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the straightforward version
if(thread_lane < 2)
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane];
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
// initialize local sum
double sum = 0;
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
{
// ensure aligned memory access to Aj and Ax
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
// accumulate local sums
if(jj >= row_start && jj < row_end) {
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x);
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
}
// accumulate local sums
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) {
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x);
}
} else {
// accumulate local sums
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) {
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x);
}
}
// store local sum in shared memory
sdata[threadIdx.x] = sum;
// reduce local sums to row sum
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8];
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4];
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2];
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1];
// first thread writes the result
if (thread_lane == 0)
y[row] = sdata[threadIdx.x];
}
}
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1)
__global__ void
storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y, const double minMaxInitializer)
{
__shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2];
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW;
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
{
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the straightforward version
if(thread_lane < 2)
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane];
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
// initialize local Min/Max
double localMinMaxElement = minMaxInitializer;
if (THREADS_PER_ROW == 32 && row_end - row_start > 32)
{
// ensure aligned memory access to Aj and Ax
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane;
// accumulate local sums
if(jj >= row_start && jj < row_end) {
if(Minimize) {
localMinMaxElement = min(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = max(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
// accumulate local sums
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW)
if(Minimize) {
localMinMaxElement = min(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = max(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
else
{
// accumulate local sums
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW)
if(Minimize) {
localMinMaxElement = min(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = max(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
// store local sum in shared memory
sdata[threadIdx.x] = localMinMaxElement;
// reduce local min/max to row min/max
if (Minimize) {
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]);
} else {
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]);
}
// first thread writes the result
if (thread_lane == 0)
x[row] = sdata[threadIdx.x];
}
}
template <bool Minimize, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y)
{
double __minMaxInitializer = -std::numeric_limits<double>::max();
if (Minimize) {
__minMaxInitializer = std::numeric_limits<double>::max();
}
const double minMaxInitializer = __minMaxInitializer;
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer);
}
template <bool Minimize>
void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y)
{
const uint_fast64_t rows_per_group = num_entries / num_rows;
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; }
__storm_cuda_opt_vector_reduce_double<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y);
}
template <bool UseCache, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y)
{
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
if (UseCache)
bind_x(x);
storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
if (UseCache)
unbind_x(x);
}
void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y)
{
const uint_fast64_t nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y)
{
const uint_fast64_t nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
// NON-OPT
template <bool UseCache, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y)
{
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
if (UseCache)
bind_x(x);
spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
if (UseCache)
unbind_x(x);
}
void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y)
{
const uint_fast64_t nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
__storm_cuda_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}
void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y)
{
const uint_fast64_t nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
__storm_cuda_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}
} // end namespace device
} // end namespace detail
} // end namespace cusp

375
resources/cudaForStorm/srcCuda/cuspExtensionFloat.h

@ -0,0 +1,375 @@
/*
* This is an extension of the original CUSP csr_vector.h SPMV implementation.
* It is based on the Code and incorporates changes as to cope with the details
* of the StoRM code.
* As this is mostly copy & paste, the original license still applies.
*/
/*
* Copyright 2008-2009 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <limits>
#include <cstdint>
#include <algorithm>
#include <math_functions.h>
#include <cusp/detail/device/spmv/csr_vector.h>
#include "storm-cudaplugin-config.h"
namespace cusp
{
namespace detail
{
namespace device
{
//////////////////////////////////////////////////////////////////////////////
// CSR SpMV kernels based on a vector model (one warp per row)
//////////////////////////////////////////////////////////////////////////////
//
// spmv_csr_vector_device
// Each row of the CSR matrix is assigned to a warp. The warp computes
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with
// the x vector, in parallel. This division of work implies that
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
// manner (but generally not aligned). On GT200 these accesses are
// coalesced, unlike kernels based on the one-row-per-thread division of
// work. Since an entire 32-thread warp is assigned to each row, many
// threads will remain idle when their row contains a small number
// of elements. This code relies on implicit synchronization among
// threads in a warp.
//
// spmv_csr_vector_tex_device
// Same as spmv_csr_vector_tex_device, except that the texture cache is
// used for accessing the x vector.
//
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32]
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y)
{
__shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2];
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
{
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the straightforward version
if(thread_lane < 2)
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane];
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
// initialize local sum
float sum = 0;
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
{
// ensure aligned memory access to Aj and Ax
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
// accumulate local sums
if(jj >= row_start && jj < row_end) {
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x);
#else
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x);
#endif
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
}
// accumulate local sums
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) {
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x);
#else
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x);
#endif
}
} else {
// accumulate local sums
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) {
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x);
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x);
#else
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x);
#endif
}
}
// store local sum in shared memory
sdata[threadIdx.x] = sum;
// reduce local sums to row sum
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8];
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4];
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2];
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1];
// first thread writes the result
if (thread_lane == 0)
y[row] = sdata[threadIdx.x];
}
}
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1)
__global__ void
storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer)
{
__shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2];
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW;
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors)
{
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the straightforward version
if(thread_lane < 2)
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane];
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
// initialize local Min/Max
float localMinMaxElement = minMaxInitializer;
if (THREADS_PER_ROW == 32 && row_end - row_start > 32)
{
// ensure aligned memory access to Aj and Ax
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane;
// accumulate local sums
if(jj >= row_start && jj < row_end) {
if(Minimize) {
localMinMaxElement = min(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = max(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
// accumulate local sums
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW)
if(Minimize) {
localMinMaxElement = min(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = max(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
else
{
// accumulate local sums
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW)
if(Minimize) {
localMinMaxElement = min(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = max(localMinMaxElement, y[jj]);
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
// store local sum in shared memory
sdata[threadIdx.x] = localMinMaxElement;
// reduce local min/max to row min/max
if (Minimize) {
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]);
} else {
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]);
}
// first thread writes the result
if (thread_lane == 0)
x[row] = sdata[threadIdx.x];
}
}
template <bool Minimize, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y)
{
float __minMaxInitializer = -std::numeric_limits<float>::max();
if (Minimize) {
__minMaxInitializer = std::numeric_limits<float>::max();
}
const float minMaxInitializer = __minMaxInitializer;
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer);
}
template <bool Minimize>
void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y)
{
const uint_fast64_t rows_per_group = num_entries / num_rows;
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_float<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_float<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_float<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; }
__storm_cuda_opt_vector_reduce_float<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y);
}
template <bool UseCache, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
{
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
if (UseCache)
bind_x(x);
storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
if (UseCache)
unbind_x(x);
}
void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
{
const uint_fast64_t nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y)
{
const uint_fast64_t nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
// NON-OPT
template <bool UseCache, unsigned int THREADS_PER_VECTOR>
void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
{
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
if (UseCache)
bind_x(x);
spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
if (UseCache)
unbind_x(x);
}
void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
{
const uint_fast64_t nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
__storm_cuda_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}
void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y)
{
const uint_fast64_t nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
__storm_cuda_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}
} // end namespace device
} // end namespace detail
} // end namespace cusp

4
resources/cudaForStorm/storm-cudaplugin-config.h.in

@ -2,7 +2,6 @@
* StoRM - Build-in Options
*
* This file is parsed by CMake during makefile generation
* It contains information such as the base path to the test/example data
*/
#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
@ -16,4 +15,7 @@
#define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" // The short hash of the git commit this build is bases on
#define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ // 0 iff there no files were modified in the checkout, 1 else
// Whether the size of float in a pair<uint_fast64_t, float> is expanded to 64bit
#@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_

3
src/models/PseudoModel.cpp

@ -101,6 +101,9 @@ namespace storm {
template class ModelBasedPseudoModel<double>;
template class NonDeterministicMatrixBasedPseudoModel<double>;
template class DeterministicMatrixBasedPseudoModel<double>;
template class ModelBasedPseudoModel <float> ;
template class NonDeterministicMatrixBasedPseudoModel <float>;
template class DeterministicMatrixBasedPseudoModel <float>;
template class ModelBasedPseudoModel<int>;
template class NonDeterministicMatrixBasedPseudoModel<int>;
template class DeterministicMatrixBasedPseudoModel<int>;

3
src/solver/NativeNondeterministicLinearEquationSolver.cpp

@ -76,7 +76,7 @@ namespace storm {
}
// Determine whether the method converged.
converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *newX, precision, relative);
// Update environment variables.
std::swap(currentX, newX);
@ -140,5 +140,6 @@ namespace storm {
// Explicitly instantiate the solver.
template class NativeNondeterministicLinearEquationSolver<double>;
template class NativeNondeterministicLinearEquationSolver<float>;
} // namespace solver
} // namespace storm

198
src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp

@ -43,8 +43,205 @@ namespace storm {
return new TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>(*this);
}
template<>
void TopologicalValueIterationNondeterministicLinearEquationSolver<float>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<float> const& A, std::vector<float>& x, std::vector<float> const& b, std::vector<float>* multiplyResult, std::vector<float>* newX) const {
// For testing only
LOG4CPLUS_INFO(logger, ">>> Using GPU based model checker WITH FLOAT! <<<");
// Now, we need to determine the SCCs of the MDP and perform a topological sort.
std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
storm::models::NonDeterministicMatrixBasedPseudoModel<float> pseudoModel(A, nondeterministicChoiceIndices);
storm::storage::StronglyConnectedComponentDecomposition<float> sccDecomposition(pseudoModel, false, false);
if (sccDecomposition.size() == 0) {
LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs.");
throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs.";
}
storm::storage::SparseMatrix<float> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
// Calculate the optimal distribution of sccs
std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A);
LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs.");
std::vector<float>* currentX = nullptr;
std::vector<float>* swap = nullptr;
size_t currentMaxLocalIterations = 0;
size_t localIterations = 0;
size_t globalIterations = 0;
bool converged = true;
// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only
// solved after all SCCs it depends on have been solved.
int counter = 0;
for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
bool const useGpu = sccIndexIt->first;
storm::storage::StateBlock const& scc = sccIndexIt->second;
// Generate a sub matrix
storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
storm::storage::SparseMatrix<float> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
std::vector<float> sccSubB(sccSubmatrix.getRowCount());
storm::utility::vector::selectVectorValues<float>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
std::vector<float> sccSubX(sccSubmatrix.getColumnCount());
std::vector<float> sccSubXSwap(sccSubmatrix.getColumnCount());
std::vector<float> sccMultiplyResult(sccSubmatrix.getRowCount());
// Prepare the pointers for swapping in the calculation
currentX = &sccSubX;
swap = &sccSubXSwap;
storm::utility::vector::selectVectorValues<float>(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states)
std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1);
sccSubNondeterministicChoiceIndices.at(0) = 0;
// Pre-process all dependent states
// Remove outgoing transitions and create the ChoiceIndices
uint_fast64_t innerIndex = 0;
uint_fast64_t outerIndex = 0;
for (uint_fast64_t state : scc) {
// Choice Indices
sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]);
for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) {
storm::storage::SparseMatrix<float>::const_rows row = A.getRow(rowGroupIt);
for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) {
if (!subMatrixIndices.get(rowIt->getColumn())) {
// This is an outgoing transition of a state in the SCC to a state not included in the SCC
// Subtracting Pr(tau) * x_other from b fixes that
sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn()));
}
}
++innerIndex;
}
++outerIndex;
}
// For the current SCC, we need to perform value iteration until convergence.
if (useGpu) {
#ifdef STORM_HAVE_CUDAFORSTORM
if (!resetCudaDevice()) {
LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
}
//LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");
//LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");
//LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());
bool result = false;
localIterations = 0;
if (minimize) {
result = basicValueIteration_mvReduce_uint64_float_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
} else {
result = basicValueIteration_mvReduce_uint64_float_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
}
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
if (!result) {
converged = false;
LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
} else {
converged = true;
}
// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
// track of the maximum.
if (localIterations > currentMaxLocalIterations) {
currentMaxLocalIterations = localIterations;
}
#else
LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
#endif
} else {
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
// Compute x' = A*x + b.
sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
storm::utility::vector::addVectorsInPlace<float>(sccMultiplyResult, sccSubB);
//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);
//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);
/*
Versus:
A.multiplyWithVector(*currentX, *multiplyResult);
storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
*/
// Reduce the vector x' by applying min/max for all non-deterministic choices.
if (minimize) {
storm::utility::vector::reduceVectorMin<float>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
} else {
storm::utility::vector::reduceVectorMax<float>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
}
// Determine whether the method converged.
// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher
// running time. In fact, it is faster. This has to be investigated.
// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);
converged = storm::utility::vector::equalModuloPrecision<float>(*currentX, *swap, this->precision, this->relative);
// Update environment variables.
std::swap(currentX, swap);
++localIterations;
++globalIterations;
}
LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
}
// The Result of this SCC has to be taken back into the main result vector
innerIndex = 0;
for (uint_fast64_t state : scc) {
x.at(state) = currentX->at(innerIndex);
++innerIndex;
}
// Since the pointers for swapping in the calculation point to temps they should not be valid anymore
currentX = nullptr;
swap = nullptr;
// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep
// track of the maximum.
if (localIterations > currentMaxLocalIterations) {
currentMaxLocalIterations = localIterations;
}
}
// Check if the solver converged and issue a warning otherwise.
if (converged) {
LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
} else {
LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
}
}
template<typename ValueType>
void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
#ifndef GPU_USE_DOUBLE
std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl;
TopologicalValueIterationNondeterministicLinearEquationSolver<float> tvindles(precision, maximalNumberOfIterations, relative);
storm::storage::SparseMatrix<float> new_A = A.toValueType<float>();
std::vector<float> new_x = storm::utility::vector::toValueType<float>(x);
std::vector<float> const new_b = storm::utility::vector::toValueType<float>(b);
tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr);
for (size_t i = 0, size = new_x.size(); i < size; ++i) {
x.at(i) = new_x.at(i);
}
#else
std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl;
// For testing only
LOG4CPLUS_INFO(logger, ">>> Using GPU based model checker! <<<");
@ -224,6 +421,7 @@ namespace storm {
else {
LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
}
#endif
}
template<typename ValueType>

4
src/storage/SparseMatrix.cpp

@ -1000,6 +1000,10 @@ namespace storm {
template class SparseMatrix<int>;
template std::ostream& operator<<(std::ostream& out, SparseMatrix<int> const& matrix);
template class MatrixEntry < float > ;
template class SparseMatrixBuilder < float >;
template class SparseMatrix < float >;
template std::ostream& operator<<(std::ostream& out, SparseMatrix<float> const& matrix);
} // namespace storage
} // namespace storm

17
src/storage/SparseMatrix.h

@ -733,6 +733,23 @@ namespace storm {
* Returns a reference to the internal columnMapping vector
*/
std::vector<MatrixEntry<T>> const& __internal_getColumnsAndValues();
/*!
* Returns a copy of the matrix with the chosen internal data type
*/
template<typename NewValueType>
SparseMatrix<NewValueType> toValueType() const {
std::vector<MatrixEntry<NewValueType>> new_columnsAndValues;
std::vector<uint_fast64_t> new_rowIndications(rowIndications);
std::vector<uint_fast64_t> new_rowGroupIndices(rowGroupIndices);
new_columnsAndValues.resize(columnsAndValues.size());
for (size_t i = 0, size = columnsAndValues.size(); i < size; ++i) {
new_columnsAndValues.at(i) = MatrixEntry<NewValueType>(columnsAndValues.at(i).getColumn(), static_cast<NewValueType>(columnsAndValues.at(i).getValue()));
}
return SparseMatrix<NewValueType>(columnCount, std::move(new_rowIndications), std::move(new_columnsAndValues), std::move(new_rowGroupIndices));
}
private:
/*!
* Creates a submatrix of the current matrix by keeping only row groups and columns in the given row group

1
src/storage/StronglyConnectedComponentDecomposition.cpp

@ -223,5 +223,6 @@ namespace storm {
// Explicitly instantiate the SCC decomposition.
template class StronglyConnectedComponentDecomposition<double>;
template class StronglyConnectedComponentDecomposition<float>;
} // namespace storage
} // namespace storm

14
src/utility/vector.h

@ -422,6 +422,20 @@ namespace storm {
return subVector;
}
/*!
* Converts the given vector to the given ValueType
*/
template<typename NewValueType, typename ValueType>
std::vector<NewValueType> toValueType(std::vector<ValueType> const& oldVector) {
std::vector<NewValueType> resultVector;
resultVector.resize(oldVector.size());
for (size_t i = 0, size = oldVector.size(); i < size; ++i) {
resultVector.at(i) = static_cast<NewValueType>(oldVector.at(i));
}
return resultVector;
}
} // namespace vector
} // namespace utility
} // namespace storm

193
test/functional/solver/CudaPluginTest.cpp

@ -41,6 +41,38 @@ TEST(CudaPlugin, SpMV_4x4) {
ASSERT_EQ(b.at(3), 0);
}
TEST(CudaPlugin, SpMV_4x4_float) {
storm::storage::SparseMatrixBuilder<float> matrixBuilder(4, 4, 10);
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0f));
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0f));
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 8.0f));
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0f));
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 2, -5.0f));
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 3, 2.0f));
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 0, 2.0f));
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 2.0f));
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, 4.0f));
ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 4.0f));
storm::storage::SparseMatrix<float> matrix;
ASSERT_NO_THROW(matrix = matrixBuilder.build());
ASSERT_EQ(4, matrix.getRowCount());
ASSERT_EQ(4, matrix.getColumnCount());
ASSERT_EQ(10, matrix.getEntryCount());
std::vector<float> x({ 0.f, 4.f, 1.f, 1.f });
std::vector<float> b({ 0.f, 0.f, 0.f, 0.f });
ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b));
ASSERT_EQ(b.at(0), 3);
ASSERT_EQ(b.at(1), 25);
ASSERT_EQ(b.at(2), 16);
ASSERT_EQ(b.at(3), 0);
}
TEST(CudaPlugin, SpMV_VerySmall) {
storm::storage::SparseMatrixBuilder<double> matrixBuilder(2, 2, 2);
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0));
@ -62,6 +94,27 @@ TEST(CudaPlugin, SpMV_VerySmall) {
ASSERT_EQ(b.at(1), 16.0);
}
TEST(CudaPlugin, SpMV_VerySmall_float) {
storm::storage::SparseMatrixBuilder<float> matrixBuilder(2, 2, 2);
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0));
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0));
storm::storage::SparseMatrix<float> matrix;
ASSERT_NO_THROW(matrix = matrixBuilder.build());
ASSERT_EQ(2, matrix.getRowCount());
ASSERT_EQ(2, matrix.getColumnCount());
ASSERT_EQ(2, matrix.getEntryCount());
std::vector<float> x({ 4.0, 8.0 });
std::vector<float> b({ 0.0, 0.0 });
ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b));
ASSERT_EQ(b.at(0), 4.0);
ASSERT_EQ(b.at(1), 16.0);
}
TEST(CudaPlugin, AddVectorsInplace) {
std::vector<double> vectorA_1 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 };
std::vector<double> vectorA_2 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 };
@ -93,6 +146,37 @@ TEST(CudaPlugin, AddVectorsInplace) {
}
}
TEST(CudaPlugin, AddVectorsInplace_float) {
std::vector<float> vectorA_1 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f };
std::vector<float> vectorA_2 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f };
std::vector<float> vectorA_3 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f };
std::vector<float> vectorB = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
std::vector<float> vectorC = { -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f };
ASSERT_EQ(vectorA_1.size(), 8);
ASSERT_EQ(vectorA_2.size(), 8);
ASSERT_EQ(vectorA_3.size(), 8);
ASSERT_EQ(vectorB.size(), 8);
ASSERT_EQ(vectorC.size(), 8);
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_1, vectorB));
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_2, vectorC));
ASSERT_EQ(vectorA_1.size(), 8);
ASSERT_EQ(vectorA_2.size(), 8);
ASSERT_EQ(vectorA_3.size(), 8);
ASSERT_EQ(vectorB.size(), 8);
ASSERT_EQ(vectorC.size(), 8);
for (size_t i = 0; i < vectorA_3.size(); ++i) {
float cpu_result_b = vectorA_3.at(i) + vectorB.at(i);
float cpu_result_c = vectorA_3.at(i) + vectorC.at(i);
ASSERT_EQ(cpu_result_b, vectorA_1.at(i));
ASSERT_EQ(cpu_result_c, vectorA_2.at(i));
}
}
TEST(CudaPlugin, ReduceGroupedVector) {
std::vector<double> groupedVector = {
0.0, -1000.0, 0.000004, // Group 0
@ -138,15 +222,60 @@ TEST(CudaPlugin, ReduceGroupedVector) {
}
}
TEST(CudaPlugin, ReduceGroupedVector_float) {
std::vector<float> groupedVector = {
0.0f, -1000.0f, 0.000004f, // Group 0
5.0f, // Group 1
0.0f, 1.0f, 2.0f, 3.0f, // Group 2
-1000.0f, -3.14f, -0.0002f,// Group 3 (neg only)
25.25f, 25.25f, 25.25f, // Group 4
0.0f, 0.0f, 1.0f, // Group 5
-0.000001f, 0.000001f // Group 6
};
std::vector<uint_fast64_t> grouping = {
0, 3, 4, 8, 11, 14, 17, 19
};
std::vector<float> result_minimize = {
-1000.0f, // Group 0
5.0f,
0.0f,
-1000.0f,
25.25f,
0.0f,
-0.000001f
};
std::vector<float> result_maximize = {
0.000004f,
5.0f,
3.0f,
-0.0002f,
25.25f,
1.0f,
0.000001f
};
std::vector<float> result_cuda_minimize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
std::vector<float> result_cuda_maximize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_minimize(groupedVector, grouping, result_cuda_minimize));
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_maximize(groupedVector, grouping, result_cuda_maximize));
for (size_t i = 0; i < result_minimize.size(); ++i) {
ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i));
ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i));
}
}
TEST(CudaPlugin, equalModuloPrecision) {
std::vector<double> x = {
123.45L, 67.8L, 901.23L, 456789.012L, 3.456789L, -4567890.12L
123.45, 67.8, 901.23, 456789.012, 3.456789, -4567890.12
};
std::vector<double> y1 = {
0.45L, 0.8L, 0.23L, 0.012L, 0.456789L, -0.12L
0.45, 0.8, 0.23, 0.012, 0.456789, -0.12
};
std::vector<double> y2 = {
0.45L, 0.8L, 0.23L, 456789.012L, 0.456789L, -4567890.12L
0.45, 0.8, 0.23, 456789.012, 0.456789, -4567890.12
};
std::vector<double> x2;
std::vector<double> x3;
@ -163,21 +292,63 @@ TEST(CudaPlugin, equalModuloPrecision) {
y4.push_back(1.0);
}
double maxElement1 = 0.0L;
double maxElement2 = 0.0L;
double maxElement3 = 0.0L;
double maxElement4 = 0.0L;
double maxElement1 = 0.0;
double maxElement2 = 0.0;
double maxElement3 = 0.0;
double maxElement4 = 0.0;
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y1, maxElement1));
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y2, maxElement2));
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x2, y3, maxElement3));
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x3, y4, maxElement4));
ASSERT_DOUBLE_EQ(4567890.0L, maxElement1);
ASSERT_DOUBLE_EQ(901.0L, maxElement2);
ASSERT_DOUBLE_EQ(4567890.0, maxElement1);
ASSERT_DOUBLE_EQ(901.0, maxElement2);
ASSERT_DOUBLE_EQ(998.0, maxElement3);
ASSERT_DOUBLE_EQ(1001.0, maxElement4);
}
TEST(CudaPlugin, equalModuloPrecision_float) {
std::vector<float> x = {
123.45f, 67.8f, 901.23f, 456789.012f, 3.456789f, -4567890.12f
};
std::vector<float> y1 = {
0.45f, 0.8f, 0.23f, 0.012f, 0.456789f, -0.12f
};
std::vector<float> y2 = {
0.45f, 0.8f, 0.23f, 456789.012f, 0.456789f, -4567890.12f
};
std::vector<float> x2;
std::vector<float> x3;
std::vector<float> y3;
std::vector<float> y4;
x2.reserve(1000);
x3.reserve(1000);
y3.reserve(1000);
y4.reserve(1000);
for (size_t i = 0; i < 1000; ++i) {
x2.push_back(static_cast<float>(i));
y3.push_back(1.0f);
x3.push_back(-(1000.0f - static_cast<float>(i)));
y4.push_back(1.0f);
}
float maxElement1 = 0.0f;
float maxElement2 = 0.0f;
float maxElement3 = 0.0f;
float maxElement4 = 0.0f;
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y1, maxElement1));
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y2, maxElement2));
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x2, y3, maxElement3));
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x3, y4, maxElement4));
ASSERT_DOUBLE_EQ(4567890.0f, maxElement1);
ASSERT_DOUBLE_EQ(901.0f, maxElement2);
ASSERT_DOUBLE_EQ(998.0L, maxElement3);
ASSERT_DOUBLE_EQ(1001.0L, maxElement4);
ASSERT_DOUBLE_EQ(998.0f, maxElement3);
ASSERT_DOUBLE_EQ(1001.0f, maxElement4);
}
#endif
Loading…
Cancel
Save