diff --git a/CMakeLists.txt b/CMakeLists.txt
index 072e50b68..cc2f18d67 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -320,7 +320,7 @@ if (ENABLE_Z3)
 	link_directories("${Z3_ROOT}/bin")
 endif()
 if (STORM_USE_CUDAFORSTORM)
-	link_directories("${PROJECT_SOURCE_DIR}/build/cudaForStorm/lib")
+	link_directories("${PROJECT_BINARY_DIR}/cudaForStorm/lib")
 endif()
 if ((NOT Boost_LIBRARY_DIRS) OR ("${Boost_LIBRARY_DIRS}" STREQUAL ""))
 	set(Boost_LIBRARY_DIRS "${Boost_INCLUDE_DIRS}/stage/lib")
diff --git a/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp b/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
new file mode 100644
index 000000000..7b3b7a8b1
--- /dev/null
+++ b/resources/cudaForStorm/CMakeFloatAlignmentCheck.cpp
@@ -0,0 +1,31 @@
+/*
+ * This is a component of StoRM - Cuda Plugin to check whether a pair of uint_fast64_t and float gets auto-aligned to match 64bit boundaries
+ */
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#define CONTAINER_SIZE 100ul
+
+int main(int argc, char* argv[]) {
+	int result = 0;
+
+	std::vector<std::pair<uint_fast64_t, float>> myVector;
+	for (size_t i = 0; i < CONTAINER_SIZE; ++i) {
+		myVector.push_back(std::make_pair(i, 42.12345f * i));
+	}
+
+	char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first));
+	char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first));
+	ptrdiff_t uintDiff = secondUintPointer - firstUintPointer;
+
+	if (uintDiff == (2 * sizeof(uint_fast64_t))) {
+		result = 2;
+	} else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) {
+		result = 3;
+	} else {
+		result = -5;
+	}
+
+	return result;
+}
\ No newline at end of file
diff --git a/resources/cudaForStorm/CMakeLists.txt b/resources/cudaForStorm/CMakeLists.txt
index 7bc37a097..d7d525386 100644
--- a/resources/cudaForStorm/CMakeLists.txt
+++ b/resources/cudaForStorm/CMakeLists.txt
@@ -131,6 +131,24 @@ else()
 	message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})")
 endif()
 
+# Test for Float 64bit Alignment
+try_run(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT
+	${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp"
+	COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR
+)
+if(NOT STORM_CUDA_COMPILE_RESULT_FLOATALIGNMENT)
+	message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test float type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeFloatAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}")
+elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 2)
+	message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment active.")
+	set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "define")
+elseif(STORM_CUDA_RUN_RESULT_FLOATALIGNMENT EQUAL 3)
+	message(STATUS "StoRM (CudaPlugin) - Result of Float Type Alignment Check: 64bit alignment disabled.")
+	set(STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF "undef")
+else()
+	message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Float Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_FLOATALIGNMENT})")
+endif()
+
+
 #
 # Make a version file containing the current version from git.
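
The "define"/"undef" string chosen by the alignment check above is presumably substituted into a generated configuration header: the basicValueIteration.cu changes below include "storm-cudaplugin-config.h" and branch on STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT. A minimal sketch of what such a template could look like, assuming a configure_file()-style substitution and a template file name (storm-cudaplugin-config.h.in) that does not appear in this diff:

    /* storm-cudaplugin-config.h.in -- hypothetical template, filled in by configure_file() */
    #ifndef STORM_CUDAPLUGIN_CONFIG_H_
    #define STORM_CUDAPLUGIN_CONFIG_H_

    /* Expands to "#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT" or
     * "#undef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT", depending on the
     * result of the float alignment check above. */
    #@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT

    #endif

With this in place, the CUDA code only needs an #ifdef to decide at compile time whether each (index, value) pair of a float matrix occupies 2 * sizeof(uint_fast64_t) bytes on the host or sizeof(uint_fast64_t) + sizeof(float), which is exactly the distinction the allocation and copy branches below make.
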
# diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index 6e84d5e70..6aa4a2fb4 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -10,20 +10,15 @@ #include "utility.h" #include "cuspExtension.h" + #include #include #include +#include "storm-cudaplugin-config.h" + #ifdef DEBUG -#define CUDA_CHECK_ALL_ERRORS() do { \ - cudaError_t errSync = cudaGetLastError(); \ - cudaError_t errAsync = cudaDeviceSynchronize(); \ - if (errSync != cudaSuccess) { \ - std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ")" << std::endl; \ - } \ - if (errAsync != cudaSuccess) { \ - std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \ - } } while(false) +#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false) #else #define CUDA_CHECK_ALL_ERRORS() do {} while (false) #endif @@ -56,15 +51,16 @@ void exploadVector(std::vector> const& inputVect } } +// TEMPLATE VERSION template -bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { +bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { //std::vector matrixColumnIndices; //std::vector matrixValues; //exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); bool errorOccured = false; IndexType* device_matrixRowIndices = nullptr; - IndexType* device_matrixColIndicesAndValues = nullptr; + ValueType* device_matrixColIndicesAndValues = nullptr; ValueType* device_x = nullptr; ValueType* device_xSwap = nullptr; ValueType* device_b = nullptr; @@ -74,7 +70,7 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy #ifdef DEBUG std::cout.sync_with_stdio(true); std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl; - std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory()))*100 << "%)." << std::endl; + std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)." 
<< std::endl; size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size(); std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; #endif @@ -96,12 +92,27 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy goto cleanup; } - CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); - if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; - errorOccured = true; - goto cleanup; +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT +#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true +#else +#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false +#endif + if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) { + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; + errorOccured = true; + goto cleanup; + } + } else { + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." 
<< std::endl; + errorOccured = true; + goto cleanup; + } } CUDA_CHECK_ALL_ERRORS(); @@ -159,12 +170,23 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy goto cleanup; } - CUDA_CHECK_ALL_ERRORS(); - cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); - if (cudaCopyResult != cudaSuccess) { - std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; - errorOccured = true; - goto cleanup; + // Copy all data as floats are expanded to 64bits :/ + if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) { + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } + } else { + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + errorOccured = true; + goto cleanup; + } } CUDA_CHECK_ALL_ERRORS(); @@ -214,9 +236,8 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy #endif // Data is on device, start Kernel - while (!converged && iterationCount < maxIterationCount) - { // In a sub-area since transfer of control via label evades initialization - cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); thrust::device_ptr devicePtrThrust_b(device_b); @@ -227,7 +248,7 @@ bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy CUDA_CHECK_ALL_ERRORS(); // Reduce: Reduce multiplyResult to a new x vector - cusp::detail::device::storm_cuda_opt_vector_reduce(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult); + cusp::detail::device::storm_cuda_opt_vector_reduce(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); // Check for convergence @@ -338,7 +359,7 @@ cleanup: template void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { IndexType* device_matrixRowIndices = nullptr; - IndexType* device_matrixColIndicesAndValues = nullptr; + ValueType* device_matrixColIndicesAndValues = nullptr; ValueType* device_x = nullptr; ValueType* device_multiplyResult = nullptr; @@ -362,12 +383,21 @@ void 
basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } +#else CUDA_CHECK_ALL_ERRORS(); cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } +#endif CUDA_CHECK_ALL_ERRORS(); cudaMallocResult = cudaMalloc(reinterpret_cast(&device_x), sizeof(ValueType) * matrixColCount); @@ -397,12 +427,21 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); #ifdef DEBUG @@ -601,7 +640,7 @@ void basicValueIteration_reduceGroupedVector(std::vector const& group do { // Reduce: Reduce multiplyResult to a new x vector - cusp::detail::device::storm_cuda_opt_vector_reduce(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); + cusp::detail::device::storm_cuda_opt_vector_reduce(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); CUDA_CHECK_ALL_ERRORS(); } while (false); @@ -713,7 +752,7 @@ cleanup: */ void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { - basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); + basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); } void basicValueIteration_addVectorsInplace_double(std::vector& a, std::vector const& b) { @@ -736,6 +775,31 @@ void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector(x, y, maxElement); } +// Float +void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { + basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); +} + +void basicValueIteration_addVectorsInplace_float(std::vector& a, std::vector const& b) { + basicValueIteration_addVectorsInplace(a, b); +} + +void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { + basicValueIteration_reduceGroupedVector(groupedVector, grouping, targetVector); +} + +void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { + basicValueIteration_reduceGroupedVector(groupedVector, grouping, targetVector); +} + +void basicValueIteration_equalModuloPrecision_float_Relative(std::vector const& x, std::vector const& y, float& maxElement) { + basicValueIteration_equalModuloPrecision(x, y, maxElement); 
+} + +void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector const& x, std::vector const& y, float& maxElement) { + basicValueIteration_equalModuloPrecision(x, y, maxElement); +} + bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { if (relativePrecisionCheck) { return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); @@ -752,6 +816,22 @@ bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const max } } +bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + if (relativePrecisionCheck) { + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); + } else { + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); + } +} + +bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount) { + if (relativePrecisionCheck) { + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); + } else { + return basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount); + } +} + size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) { size_t const valueTypeSize = sizeof(double); size_t const indexTypeSize = sizeof(uint_fast64_t); @@ -772,5 +852,28 @@ size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t con // Vectors x, xSwap, b, multiplyResult size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize); + return (matrixDataSize + vectorSizes); +} + +size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) { + size_t const valueTypeSize = sizeof(float); + size_t const indexTypeSize = sizeof(uint_fast64_t); + + /* + IndexType* device_matrixRowIndices = nullptr; + IndexType* device_matrixColIndices = nullptr; + ValueType* device_matrixValues = nullptr; + ValueType* device_x = nullptr; + ValueType* device_xSwap = nullptr; + ValueType* device_b = nullptr; + ValueType* device_multiplyResult = nullptr; + IndexType* device_nondeterministicChoiceIndices = nullptr; + */ + + // Row Indices, Column Indices, Values, Choice 
Indices + size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize); + // Vectors x, xSwap, b, multiplyResult + size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize); + return (matrixDataSize + vectorSizes); } \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index 728ba07f9..09b4be5ca 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -85,6 +85,11 @@ template cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount); cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount); + +cudaForStorm_EXPORT size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); +cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount); +cudaForStorm_EXPORT bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices, size_t& iterationCount); + cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector& a, std::vector const& b); cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); @@ -92,4 +97,11 @@ cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_m cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector const& x, std::vector const& y, double& maxElement); cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector const& x, std::vector const& y, double& maxElement); +cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, 
std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); +cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_float(std::vector& a, std::vector const& b); +cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); +cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); +cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_Relative(std::vector const& x, std::vector const& y, float& maxElement); +cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector const& x, std::vector const& y, float& maxElement); + #endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h index 7c9a9fb28..11c673bf9 100644 --- a/resources/cudaForStorm/srcCuda/cuspExtension.h +++ b/resources/cudaForStorm/srcCuda/cuspExtension.h @@ -1,338 +1,47 @@ -/* - * This is an extension of the original CUSP csr_vector.h SPMV implementation. - * It is based on the Code and incorporates changes as to cope with the details - * of the StoRM code. - * As this is mostly copy & paste, the original license still applies. - */ - -/* - * Copyright 2008-2009 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - #pragma once -#include -#include -#include - -namespace cusp -{ -namespace detail -{ -namespace device -{ - -////////////////////////////////////////////////////////////////////////////// -// CSR SpMV kernels based on a vector model (one warp per row) -////////////////////////////////////////////////////////////////////////////// -// -// spmv_csr_vector_device -// Each row of the CSR matrix is assigned to a warp. The warp computes -// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with -// the x vector, in parallel. This division of work implies that -// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous -// manner (but generally not aligned). On GT200 these accesses are -// coalesced, unlike kernels based on the one-row-per-thread division of -// work. Since an entire 32-thread warp is assigned to each row, many -// threads will remain idle when their row contains a small number -// of elements. This code relies on implicit synchronization among -// threads in a warp. -// -// spmv_csr_vector_tex_device -// Same as spmv_csr_vector_tex_device, except that the texture cache is -// used for accessing the x vector. 
-// -// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] - - -template -__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) -__global__ void -storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y) -{ - __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals - __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2]; - - const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; - - const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index - const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector - const IndexType vector_id = thread_id / THREADS_PER_VECTOR; // global vector index - const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block - const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors - - for(IndexType row = vector_id; row < num_rows; row += num_vectors) - { - // use two threads to fetch Ap[row] and Ap[row+1] - // this is considerably faster than the straightforward version - if(thread_lane < 2) - ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; - - const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; - const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; - - // initialize local sum - ValueType sum = 0; - - if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) - { - // ensure aligned memory access to Aj and Ax - - IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; - - // accumulate local sums - if(jj >= row_start && jj < row_end) - sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); - - // accumulate local sums - for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) - sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); - } - else - { - // accumulate local sums - for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) - sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); - } - - // store local sum in shared memory - sdata[threadIdx.x] = sum; - - // reduce local sums to row sum - if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; - if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; - if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; - if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; - if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; - - // first thread writes the result - if (thread_lane == 0) - y[row] = sdata[threadIdx.x]; - } -} - -template -__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) -__global__ void -storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y, const ValueType minMaxInitializer) -{ - __shared__ volatile ValueType sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals - __shared__ volatile IndexType 
ptrs[ROWS_PER_BLOCK][2]; - - const IndexType THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; - - const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index - const IndexType thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector - const IndexType vector_id = thread_id / THREADS_PER_ROW; // global vector index - const IndexType vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block - const IndexType num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors - - for(IndexType row = vector_id; row < num_rows; row += num_vectors) - { - // use two threads to fetch Ap[row] and Ap[row+1] - // this is considerably faster than the straightforward version - if(thread_lane < 2) - ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; - - const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; - const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; - - // initialize local Min/Max - ValueType localMinMaxElement = minMaxInitializer; - - if (THREADS_PER_ROW == 32 && row_end - row_start > 32) - { - // ensure aligned memory access to Aj and Ax - - IndexType jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; - - // accumulate local sums - if(jj >= row_start && jj < row_end) { - if(Minimize) { - localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; - } - } +#include "cuspExtensionFloat.h" +#include "cuspExtensionDouble.h" - // accumulate local sums - for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) - if(Minimize) { - localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; - } - } - else - { - // accumulate local sums - for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) - if(Minimize) { - localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; - } else { - localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; - } - } +namespace cusp { +namespace detail { +namespace device { - // store local sum in shared memory - sdata[threadIdx.x] = localMinMaxElement; - - // reduce local min/max to row min/max - if (Minimize) { - if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement); - } else { - if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? 
sdata[threadIdx.x + 16] : localMinMaxElement); - if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); - if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); - if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); - if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement); - } - - // first thread writes the result - if (thread_lane == 0) - x[row] = sdata[threadIdx.x]; - } +template +void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) { + // + throw; } - -template -void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) -{ - ValueType __minMaxInitializer = -std::numeric_limits::max(); - if (Minimize) { - __minMaxInitializer = std::numeric_limits::max(); - } - const ValueType minMaxInitializer = __minMaxInitializer; - - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - storm_cuda_opt_vector_reduce_kernel <<>> - (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); -} - -template -void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_entries, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) -{ - const IndexType rows_per_group = num_entries / num_rows; - - if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } - if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } - - __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); +template <> +void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) { + storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y); } - -template -void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) -{ - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, 
DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - if (UseCache) - bind_x(x); - - storm_cuda_opt_spmv_csr_vector_kernel <<>> - (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); - - if (UseCache) - unbind_x(x); +template <> +void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) { + storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y); } -template -void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) -{ - const IndexType nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - - __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +template +void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) { + // + throw; } - -template -void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) -{ - const IndexType nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - - __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +template <> +void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) { + storm_cuda_opt_vector_reduce_double(num_rows, num_entries, nondeterministicChoiceIndices, x, y); } - -// NON-OPT - -template -void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) -{ - const size_t THREADS_PER_BLOCK = 128; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); - - if (UseCache) - bind_x(x); - - spmv_csr_vector_kernel 
<<>> - (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); - - if (UseCache) - unbind_x(x); +template <> +void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) { + storm_cuda_opt_vector_reduce_double(num_rows, num_entries, nondeterministicChoiceIndices, x, y); } -template -void storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) -{ - const IndexType nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - - __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +template <> +void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) { + storm_cuda_opt_vector_reduce_float(num_rows, num_entries, nondeterministicChoiceIndices, x, y); } - -template -void storm_cuda_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) -{ - const IndexType nnz_per_row = num_entries / num_rows; - - if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - - __storm_cuda_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +template <> +void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) { + storm_cuda_opt_vector_reduce_float(num_rows, num_entries, nondeterministicChoiceIndices, x, y); } } // end namespace device diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h new file mode 100644 index 000000000..eee59b007 --- /dev/null +++ b/resources/cudaForStorm/srcCuda/cuspExtensionDouble.h @@ -0,0 +1,361 @@ +/* + * This is an extension of the original CUSP csr_vector.h SPMV implementation. + * It is based on the Code and incorporates changes as to cope with the details + * of the StoRM code. + * As this is mostly copy & paste, the original license still applies. 
+ */ + +/* + * Copyright 2008-2009 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cusp +{ +namespace detail +{ +namespace device +{ + +////////////////////////////////////////////////////////////////////////////// +// CSR SpMV kernels based on a vector model (one warp per row) +////////////////////////////////////////////////////////////////////////////// +// +// spmv_csr_vector_device +// Each row of the CSR matrix is assigned to a warp. The warp computes +// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with +// the x vector, in parallel. This division of work implies that +// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous +// manner (but generally not aligned). On GT200 these accesses are +// coalesced, unlike kernels based on the one-row-per-thread division of +// work. Since an entire 32-thread warp is assigned to each row, many +// threads will remain idle when their row contains a small number +// of elements. This code relies on implicit synchronization among +// threads in a warp. +// +// spmv_csr_vector_tex_device +// Same as spmv_csr_vector_tex_device, except that the texture cache is +// used for accessing the x vector. 
+// +// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] + + +template +__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) +__global__ void +storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double * x, double * y) +{ + __shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block + const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local sum + double sum = 0; + + if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + } + + // accumulate local sums + for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); + } + } else { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 2 * jj), x); + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = sum; + + // reduce local sums to row sum + if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; + if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; + if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; + if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; + if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; + + // first thread writes the result + if (thread_lane == 0) + y[row] = sdata[threadIdx.x]; + } +} + +template 
+__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) +__global__ void +storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y, const double minMaxInitializer) +{ + __shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block + const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local Min/Max + double localMinMaxElement = minMaxInitializer; + + if (THREADS_PER_ROW == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // accumulate local sums + for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + else + { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = localMinMaxElement; + + // reduce local min/max to row min/max + if (Minimize) { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? 
sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ + + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); + } else { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? 
sdata[threadIdx.x + 1] : localMinMaxElement);*/ + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); + } + + // first thread writes the result + if (thread_lane == 0) + x[row] = sdata[threadIdx.x]; + } +} + +template +void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) +{ + double __minMaxInitializer = -std::numeric_limits::max(); + if (Minimize) { + __minMaxInitializer = std::numeric_limits::max(); + } + const double minMaxInitializer = __minMaxInitializer; + + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + storm_cuda_opt_vector_reduce_kernel_double <<>> + (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); +} + +template +void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) +{ + const uint_fast64_t rows_per_group = num_entries / num_rows; + + if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); return; } + + __storm_cuda_opt_vector_reduce_double(num_rows, nondeterministicChoiceIndices, x, y); +} + +template +void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + storm_cuda_opt_spmv_csr_vector_kernel_double <<>> + (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { 
__storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +// NON-OPT + +template +void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + spmv_csr_vector_kernel <<>> + (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const 
double * matrixValues, const double* x, double* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_double(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +} // end namespace device +} // end namespace detail +} // end namespace cusp \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h b/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h new file mode 100644 index 000000000..bb9acf78e --- /dev/null +++ b/resources/cudaForStorm/srcCuda/cuspExtensionFloat.h @@ -0,0 +1,375 @@ +/* + * This is an extension of the original CUSP csr_vector.h SPMV implementation. + * It is based on the Code and incorporates changes as to cope with the details + * of the StoRM code. + * As this is mostly copy & paste, the original license still applies. + */ + +/* + * Copyright 2008-2009 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include +#include + +#include + +#include + +#include "storm-cudaplugin-config.h" + +namespace cusp +{ +namespace detail +{ +namespace device +{ + +////////////////////////////////////////////////////////////////////////////// +// CSR SpMV kernels based on a vector model (one warp per row) +////////////////////////////////////////////////////////////////////////////// +// +// spmv_csr_vector_device +// Each row of the CSR matrix is assigned to a warp. The warp computes +// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with +// the x vector, in parallel. This division of work implies that +// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous +// manner (but generally not aligned). On GT200 these accesses are +// coalesced, unlike kernels based on the one-row-per-thread division of +// work. Since an entire 32-thread warp is assigned to each row, many +// threads will remain idle when their row contains a small number +// of elements. This code relies on implicit synchronization among +// threads in a warp. +// +// spmv_csr_vector_tex_device +// Same as spmv_csr_vector_tex_device, except that the texture cache is +// used for accessing the x vector. 
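For orientation, the work each warp (vector) performs in the kernels of this header corresponds to the host-side loop below. This is a minimal reference sketch assuming a plain CSR layout with separate column-index and value arrays; the kernels in this file instead read from an interleaved index/value array and reduce the per-thread partial sums in shared memory.

// Host-side reference for the per-row computation y[i] = A[i,:] * x.
// Assumes rowIndices has num_rows + 1 entries and y is sized num_rows.
#include <cstdint>
#include <vector>

void csr_spmv_reference(std::vector<uint_fast64_t> const& rowIndices,
                        std::vector<uint_fast64_t> const& columns,
                        std::vector<float> const& values,
                        std::vector<float> const& x,
                        std::vector<float>& y) {
    for (uint_fast64_t row = 0; row + 1 < rowIndices.size(); ++row) {
        float sum = 0.0f;
        // On the GPU, the entries of this row are distributed over the threads
        // of one warp/vector and the partial sums are reduced in shared memory.
        for (uint_fast64_t jj = rowIndices[row]; jj < rowIndices[row + 1]; ++jj) {
            sum += values[jj] * x[columns[jj]];
        }
        y[row] = sum;
    }
}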
+// +// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] + + +template +__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) +__global__ void +storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y) +{ + __shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block + const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local sum + float sum = 0; + + if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); +#else + sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); +#endif + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); + } + + // accumulate local sums + for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); +#else + sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); +#endif + } + } else { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { + //sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x); +#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 4 * jj), x); +#else + sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x(*reinterpret_cast(matrixColumnIndicesAndValues + 3 * jj), x); +#endif + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = sum; + + // reduce local sums 
to row sum + if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; + if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; + if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; + if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; + if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; + + // first thread writes the result + if (thread_lane == 0) + y[row] = sdata[threadIdx.x]; + } +} + +template +__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) +__global__ void +storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer) +{ + __shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals + __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; + + const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; + + const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector + const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index + const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block + const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; + + const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local Min/Max + float localMinMaxElement = minMaxInitializer; + + if (THREADS_PER_ROW == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // accumulate local sums + for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + else + { + // accumulate local sums + for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = min(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = max(localMinMaxElement, y[jj]); + //localMinMaxElement = (localMinMaxElement < y[jj]) ? 
y[jj] : localMinMaxElement; + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = localMinMaxElement; + + // reduce local min/max to row min/max + if (Minimize) { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ + + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); + } else { + /*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? 
sdata[threadIdx.x + 1] : localMinMaxElement);*/ + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); + } + + // first thread writes the result + if (thread_lane == 0) + x[row] = sdata[threadIdx.x]; + } +} + +template +void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) +{ + float __minMaxInitializer = -std::numeric_limits::max(); + if (Minimize) { + __minMaxInitializer = std::numeric_limits::max(); + } + const float minMaxInitializer = __minMaxInitializer; + + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + storm_cuda_opt_vector_reduce_kernel_float <<>> + (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); +} + +template +void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) +{ + const uint_fast64_t rows_per_group = num_entries / num_rows; + + if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); return; } + + __storm_cuda_opt_vector_reduce_float(num_rows, nondeterministicChoiceIndices, x, y); +} + +template +void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + storm_cuda_opt_spmv_csr_vector_kernel_float <<>> + (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, 
matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +// NON-OPT + +template +void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + spmv_csr_vector_kernel <<>> + (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); + + if (UseCache) + unbind_x(x); +} + +void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) +{ + const uint_fast64_t nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) +{ + const uint_fast64_t 
nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + + __storm_cuda_spmv_csr_vector_float(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); +} + +} // end namespace device +} // end namespace detail +} // end namespace cusp \ No newline at end of file diff --git a/resources/cudaForStorm/storm-cudaplugin-config.h.in b/resources/cudaForStorm/storm-cudaplugin-config.h.in index 1cfc9119e..3703d0c81 100644 --- a/resources/cudaForStorm/storm-cudaplugin-config.h.in +++ b/resources/cudaForStorm/storm-cudaplugin-config.h.in @@ -2,7 +2,6 @@ * StoRM - Build-in Options * * This file is parsed by CMake during makefile generation - * It contains information such as the base path to the test/example data */ #ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ @@ -16,4 +15,7 @@ #define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" // The short hash of the git commit this build is bases on #define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ // 0 iff there no files were modified in the checkout, 1 else +// Whether the size of float in a pair is expanded to 64bit +#@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT + #endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_ diff --git a/src/models/PseudoModel.cpp b/src/models/PseudoModel.cpp index 7f26e4f36..af446778c 100644 --- a/src/models/PseudoModel.cpp +++ b/src/models/PseudoModel.cpp @@ -101,6 +101,9 @@ namespace storm { template class ModelBasedPseudoModel; template class NonDeterministicMatrixBasedPseudoModel; template class DeterministicMatrixBasedPseudoModel; + template class ModelBasedPseudoModel ; + template class NonDeterministicMatrixBasedPseudoModel ; + template class DeterministicMatrixBasedPseudoModel ; template class ModelBasedPseudoModel; template class NonDeterministicMatrixBasedPseudoModel; template class DeterministicMatrixBasedPseudoModel; diff --git a/src/solver/NativeNondeterministicLinearEquationSolver.cpp b/src/solver/NativeNondeterministicLinearEquationSolver.cpp index 7f88258f1..3fd213ad9 100644 --- a/src/solver/NativeNondeterministicLinearEquationSolver.cpp +++ b/src/solver/NativeNondeterministicLinearEquationSolver.cpp @@ -76,7 +76,7 @@ namespace storm { } // Determine whether the method converged. - converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *newX, precision, relative); // Update environment variables. std::swap(currentX, newX); @@ -140,5 +140,6 @@ namespace storm { // Explicitly instantiate the solver. 
template class NativeNondeterministicLinearEquationSolver; + template class NativeNondeterministicLinearEquationSolver; } // namespace solver } // namespace storm diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index 4955797ed..8a09d9e7d 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -42,9 +42,206 @@ namespace storm { NondeterministicLinearEquationSolver* TopologicalValueIterationNondeterministicLinearEquationSolver::clone() const { return new TopologicalValueIterationNondeterministicLinearEquationSolver(*this); } + + template<> + void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { + // For testing only + LOG4CPLUS_INFO(logger, ">>> Using GPU based model checker WITH FLOAT! <<<"); + + // Now, we need to determine the SCCs of the MDP and perform a topological sort. + std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); + storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); + storm::storage::StronglyConnectedComponentDecomposition sccDecomposition(pseudoModel, false, false); + + if (sccDecomposition.size() == 0) { + LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs."); + throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs."; + } + + storm::storage::SparseMatrix stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition); + std::vector topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph); + + // Calculate the optimal distribution of sccs + std::vector> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A); + LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs."); + + std::vector* currentX = nullptr; + std::vector* swap = nullptr; + size_t currentMaxLocalIterations = 0; + size_t localIterations = 0; + size_t globalIterations = 0; + bool converged = true; + + // Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only + // solved after all SCCs it depends on have been solved. 
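As background, a topological sort of the SCC dependency graph can be obtained with Kahn's algorithm, sketched below for a plain adjacency-list graph. This is only an illustration of the idea; the order actually used here comes from storm::utility::graph::getTopologicalSort, whose implementation and edge orientation may differ.

// Minimal sketch of a topological sort (Kahn's algorithm): every vertex is
// emitted before all of its successors in the given graph.
#include <cstdint>
#include <deque>
#include <vector>

std::vector<uint_fast64_t> topologicalSort(std::vector<std::vector<uint_fast64_t>> const& successors) {
    std::vector<uint_fast64_t> inDegree(successors.size(), 0);
    for (auto const& succ : successors)
        for (uint_fast64_t v : succ) ++inDegree[v];

    std::deque<uint_fast64_t> queue;
    for (uint_fast64_t v = 0; v < successors.size(); ++v)
        if (inDegree[v] == 0) queue.push_back(v);

    std::vector<uint_fast64_t> order;
    while (!queue.empty()) {
        uint_fast64_t v = queue.front(); queue.pop_front();
        order.push_back(v);
        for (uint_fast64_t w : successors[v])
            if (--inDegree[w] == 0) queue.push_back(w);
    }
    return order;
}

Processing the SCCs in such an order is what guarantees that, when an SCC is solved, the x-values of all states it can reach outside of itself are already fixed.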
+ int counter = 0; + + for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) { + bool const useGpu = sccIndexIt->first; + storm::storage::StateBlock const& scc = sccIndexIt->second; + + // Generate a sub matrix + storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); + storm::storage::SparseMatrix sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices); + std::vector sccSubB(sccSubmatrix.getRowCount()); + storm::utility::vector::selectVectorValues(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b); + std::vector sccSubX(sccSubmatrix.getColumnCount()); + std::vector sccSubXSwap(sccSubmatrix.getColumnCount()); + std::vector sccMultiplyResult(sccSubmatrix.getRowCount()); + + // Prepare the pointers for swapping in the calculation + currentX = &sccSubX; + swap = &sccSubXSwap; + + storm::utility::vector::selectVectorValues(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states) + std::vector sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1); + sccSubNondeterministicChoiceIndices.at(0) = 0; + + // Pre-process all dependent states + // Remove outgoing transitions and create the ChoiceIndices + uint_fast64_t innerIndex = 0; + uint_fast64_t outerIndex = 0; + for (uint_fast64_t state : scc) { + // Choice Indices + sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]); + + for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) { + storm::storage::SparseMatrix::const_rows row = A.getRow(rowGroupIt); + for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) { + if (!subMatrixIndices.get(rowIt->getColumn())) { + // This is an outgoing transition of a state in the SCC to a state not included in the SCC + // Subtracting Pr(tau) * x_other from b fixes that + sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn())); + } + } + ++innerIndex; + } + ++outerIndex; + } + + // For the current SCC, we need to perform value iteration until convergence. 
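Written as an equation (a sketch, with S the current SCC, A(s) the choices of state s, and opt being min when minimize is set and max otherwise), the iteration carried out below computes

x_s^{(k+1)} \;=\; \operatorname{opt}_{a \in A(s)} \Big( b_{s,a} \;+\; \sum_{t \in S} P(s,a,t)\, x_t^{(k)} \;+\; \sum_{t \notin S} P(s,a,t)\, x_t \Big).

The last sum is constant during the iteration because the values x_t of states outside the SCC are already final; it is precisely the contribution that the preprocessing loop above folds into sccSubB.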
+ if (useGpu) { +#ifdef STORM_HAVE_CUDAFORSTORM + if (!resetCudaDevice()) { + LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver."); + throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver."; + } + + //LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)."); + //LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes."); + //LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion()); + + bool result = false; + localIterations = 0; + if (minimize) { + result = basicValueIteration_mvReduce_uint64_float_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } else { + result = basicValueIteration_mvReduce_uint64_float_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations); + } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU."); + + if (!result) { + converged = false; + LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue."); + throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue."; + } else { + converged = true; + } + + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + +#else + LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"); + throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!"; +#endif + } else { + localIterations = 0; + converged = false; + while (!converged && localIterations < this->maximalNumberOfIterations) { + // Compute x' = A*x + b. + sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult); + storm::utility::vector::addVectorsInPlace(sccMultiplyResult, sccSubB); + + //A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult); + //storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b); + + /* + Versus: + A.multiplyWithVector(*currentX, *multiplyResult); + storm::utility::vector::addVectorsInPlace(*multiplyResult, b); + */ + + // Reduce the vector x' by applying min/max for all non-deterministic choices. 
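For reference, the min/max reduction invoked in the next statement (and computed on the GPU by the storm_cuda_opt_vector_reduce kernels above) amounts to the following host-side sketch, assuming every row group is non-empty:

// Reduce a row-grouped vector to one value per group: groupIndices[i] ..
// groupIndices[i+1] delimits the rows (choices) belonging to state i.
#include <algorithm>
#include <cstdint>
#include <vector>

void reduceGroupedVector(bool minimize,
                         std::vector<float> const& groupedVector,
                         std::vector<uint_fast64_t> const& groupIndices, // size #states + 1
                         std::vector<float>& result) {               // size #states
    for (uint_fast64_t group = 0; group + 1 < groupIndices.size(); ++group) {
        float best = groupedVector[groupIndices[group]]; // groups assumed non-empty
        for (uint_fast64_t i = groupIndices[group] + 1; i < groupIndices[group + 1]; ++i) {
            best = minimize ? std::min(best, groupedVector[i]) : std::max(best, groupedVector[i]);
        }
        result[group] = best;
    }
}

Applied to the data of the ReduceGroupedVector_float test further down, group 0 = {0.0f, -1000.0f, 0.000004f} reduces to -1000.0f when minimizing and 0.000004f when maximizing, matching the expected results there.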
+ if (minimize) { + storm::utility::vector::reduceVectorMin(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } else { + storm::utility::vector::reduceVectorMax(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices); + } + + // Determine whether the method converged. + // TODO: It seems that the equalModuloPrecision call that compares all values should have a higher + // running time. In fact, it is faster. This has to be investigated. + // converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative); + converged = storm::utility::vector::equalModuloPrecision(*currentX, *swap, this->precision, this->relative); + + // Update environment variables. + std::swap(currentX, swap); + + ++localIterations; + ++globalIterations; + } + LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations."); + } + + + // The result of this SCC has to be written back into the main result vector + innerIndex = 0; + for (uint_fast64_t state : scc) { + x.at(state) = currentX->at(innerIndex); + ++innerIndex; + } + + // Since the pointers for swapping in the calculation point to temps they should not be valid anymore + currentX = nullptr; + swap = nullptr; + + // As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep + // track of the maximum. + if (localIterations > currentMaxLocalIterations) { + currentMaxLocalIterations = localIterations; + } + } + + // Check if the solver converged and issue a warning otherwise. + if (converged) { + LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations."); + } else { + LOG4CPLUS_WARN(logger, "Iterative solver did not converge after " << currentMaxLocalIterations << " iterations."); + } + } template void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { + +#ifndef GPU_USE_DOUBLE + std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl; + TopologicalValueIterationNondeterministicLinearEquationSolver tvindles(precision, maximalNumberOfIterations, relative); + + storm::storage::SparseMatrix new_A = A.toValueType(); + std::vector new_x = storm::utility::vector::toValueType(x); + std::vector const new_b = storm::utility::vector::toValueType(b); + + tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr); + + for (size_t i = 0, size = new_x.size(); i < size; ++i) { + x.at(i) = new_x.at(i); + } +#else + std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl; // For testing only LOG4CPLUS_INFO(logger, ">>> Using GPU based model checker!
<<<"); @@ -224,6 +421,7 @@ namespace storm { else { LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations."); } +#endif } template diff --git a/src/storage/SparseMatrix.cpp b/src/storage/SparseMatrix.cpp index c300e6858..ff7d71db5 100644 --- a/src/storage/SparseMatrix.cpp +++ b/src/storage/SparseMatrix.cpp @@ -1000,6 +1000,10 @@ namespace storm { template class SparseMatrix; template std::ostream& operator<<(std::ostream& out, SparseMatrix const& matrix); + template class MatrixEntry < float > ; + template class SparseMatrixBuilder < float >; + template class SparseMatrix < float >; + template std::ostream& operator<<(std::ostream& out, SparseMatrix const& matrix); } // namespace storage } // namespace storm diff --git a/src/storage/SparseMatrix.h b/src/storage/SparseMatrix.h index a06fbeb60..2a3dcc7c1 100644 --- a/src/storage/SparseMatrix.h +++ b/src/storage/SparseMatrix.h @@ -733,6 +733,23 @@ namespace storm { * Returns a reference to the internal columnMapping vector */ std::vector> const& __internal_getColumnsAndValues(); + + /*! + * Returns a copy of the matrix with the chosen internal data type + */ + template + SparseMatrix toValueType() const { + std::vector> new_columnsAndValues; + std::vector new_rowIndications(rowIndications); + std::vector new_rowGroupIndices(rowGroupIndices); + + new_columnsAndValues.resize(columnsAndValues.size()); + for (size_t i = 0, size = columnsAndValues.size(); i < size; ++i) { + new_columnsAndValues.at(i) = MatrixEntry(columnsAndValues.at(i).getColumn(), static_cast(columnsAndValues.at(i).getValue())); + } + + return SparseMatrix(columnCount, std::move(new_rowIndications), std::move(new_columnsAndValues), std::move(new_rowGroupIndices)); + } private: /*! * Creates a submatrix of the current matrix by keeping only row groups and columns in the given row group diff --git a/src/storage/StronglyConnectedComponentDecomposition.cpp b/src/storage/StronglyConnectedComponentDecomposition.cpp index 454ba9e00..7c584a312 100644 --- a/src/storage/StronglyConnectedComponentDecomposition.cpp +++ b/src/storage/StronglyConnectedComponentDecomposition.cpp @@ -223,5 +223,6 @@ namespace storm { // Explicitly instantiate the SCC decomposition. template class StronglyConnectedComponentDecomposition; + template class StronglyConnectedComponentDecomposition; } // namespace storage } // namespace storm \ No newline at end of file diff --git a/src/utility/vector.h b/src/utility/vector.h index 79fd28046..1547903fa 100644 --- a/src/utility/vector.h +++ b/src/utility/vector.h @@ -422,6 +422,20 @@ namespace storm { return subVector; } + + /*! 
+ * Converts the given vector to the given ValueType + */ + template + std::vector toValueType(std::vector const& oldVector) { + std::vector resultVector; + resultVector.resize(oldVector.size()); + for (size_t i = 0, size = oldVector.size(); i < size; ++i) { + resultVector.at(i) = static_cast(oldVector.at(i)); + } + + return resultVector; + } } // namespace vector } // namespace utility } // namespace storm diff --git a/test/functional/solver/CudaPluginTest.cpp b/test/functional/solver/CudaPluginTest.cpp index 3d2a69c84..da52f3e2b 100644 --- a/test/functional/solver/CudaPluginTest.cpp +++ b/test/functional/solver/CudaPluginTest.cpp @@ -41,6 +41,38 @@ TEST(CudaPlugin, SpMV_4x4) { ASSERT_EQ(b.at(3), 0); } +TEST(CudaPlugin, SpMV_4x4_float) { + storm::storage::SparseMatrixBuilder matrixBuilder(4, 4, 10); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 0, 8.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 7.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 2, -5.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 3, 2.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 0, 2.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 1, 2.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 2, 4.0f)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(2, 3, 4.0f)); + + + storm::storage::SparseMatrix matrix; + ASSERT_NO_THROW(matrix = matrixBuilder.build()); + + ASSERT_EQ(4, matrix.getRowCount()); + ASSERT_EQ(4, matrix.getColumnCount()); + ASSERT_EQ(10, matrix.getEntryCount()); + + std::vector x({ 0.f, 4.f, 1.f, 1.f }); + std::vector b({ 0.f, 0.f, 0.f, 0.f }); + + ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); + + ASSERT_EQ(b.at(0), 3); + ASSERT_EQ(b.at(1), 25); + ASSERT_EQ(b.at(2), 16); + ASSERT_EQ(b.at(3), 0); +} + TEST(CudaPlugin, SpMV_VerySmall) { storm::storage::SparseMatrixBuilder matrixBuilder(2, 2, 2); ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); @@ -62,6 +94,27 @@ TEST(CudaPlugin, SpMV_VerySmall) { ASSERT_EQ(b.at(1), 16.0); } +TEST(CudaPlugin, SpMV_VerySmall_float) { + storm::storage::SparseMatrixBuilder matrixBuilder(2, 2, 2); + ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); + ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0)); + + storm::storage::SparseMatrix matrix; + ASSERT_NO_THROW(matrix = matrixBuilder.build()); + + ASSERT_EQ(2, matrix.getRowCount()); + ASSERT_EQ(2, matrix.getColumnCount()); + ASSERT_EQ(2, matrix.getEntryCount()); + + std::vector x({ 4.0, 8.0 }); + std::vector b({ 0.0, 0.0 }); + + ASSERT_NO_THROW(basicValueIteration_spmv_uint64_float(matrix.getColumnCount(), matrix.__internal_getRowIndications(), matrix.__internal_getColumnsAndValues(), x, b)); + + ASSERT_EQ(b.at(0), 4.0); + ASSERT_EQ(b.at(1), 16.0); +} + TEST(CudaPlugin, AddVectorsInplace) { std::vector vectorA_1 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; std::vector vectorA_2 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; @@ -93,6 +146,37 @@ TEST(CudaPlugin, AddVectorsInplace) { } } +TEST(CudaPlugin, AddVectorsInplace_float) { + std::vector vectorA_1 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; + std::vector vectorA_2 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; + std::vector 
vectorA_3 = { 0.0f, 42.0f, 21.4f, 3.1415f, 1.0f, 7.3490390f, 94093053905390.21f, -0.000000000023f }; + std::vector vectorB = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; + std::vector vectorC = { -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f, -5000.0f }; + + ASSERT_EQ(vectorA_1.size(), 8); + ASSERT_EQ(vectorA_2.size(), 8); + ASSERT_EQ(vectorA_3.size(), 8); + ASSERT_EQ(vectorB.size(), 8); + ASSERT_EQ(vectorC.size(), 8); + + ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_1, vectorB)); + ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_float(vectorA_2, vectorC)); + + ASSERT_EQ(vectorA_1.size(), 8); + ASSERT_EQ(vectorA_2.size(), 8); + ASSERT_EQ(vectorA_3.size(), 8); + ASSERT_EQ(vectorB.size(), 8); + ASSERT_EQ(vectorC.size(), 8); + + for (size_t i = 0; i < vectorA_3.size(); ++i) { + float cpu_result_b = vectorA_3.at(i) + vectorB.at(i); + float cpu_result_c = vectorA_3.at(i) + vectorC.at(i); + + ASSERT_EQ(cpu_result_b, vectorA_1.at(i)); + ASSERT_EQ(cpu_result_c, vectorA_2.at(i)); + } +} + TEST(CudaPlugin, ReduceGroupedVector) { std::vector groupedVector = { 0.0, -1000.0, 0.000004, // Group 0 @@ -138,15 +222,60 @@ TEST(CudaPlugin, ReduceGroupedVector) { } } +TEST(CudaPlugin, ReduceGroupedVector_float) { + std::vector groupedVector = { + 0.0f, -1000.0f, 0.000004f, // Group 0 + 5.0f, // Group 1 + 0.0f, 1.0f, 2.0f, 3.0f, // Group 2 + -1000.0f, -3.14f, -0.0002f,// Group 3 (neg only) + 25.25f, 25.25f, 25.25f, // Group 4 + 0.0f, 0.0f, 1.0f, // Group 5 + -0.000001f, 0.000001f // Group 6 + }; + std::vector grouping = { + 0, 3, 4, 8, 11, 14, 17, 19 + }; + + std::vector result_minimize = { + -1000.0f, // Group 0 + 5.0f, + 0.0f, + -1000.0f, + 25.25f, + 0.0f, + -0.000001f + }; + std::vector result_maximize = { + 0.000004f, + 5.0f, + 3.0f, + -0.0002f, + 25.25f, + 1.0f, + 0.000001f + }; + + std::vector result_cuda_minimize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; + std::vector result_cuda_maximize = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; + + ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_minimize(groupedVector, grouping, result_cuda_minimize)); + ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_float_maximize(groupedVector, grouping, result_cuda_maximize)); + + for (size_t i = 0; i < result_minimize.size(); ++i) { + ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i)); + ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i)); + } +} + TEST(CudaPlugin, equalModuloPrecision) { std::vector x = { - 123.45L, 67.8L, 901.23L, 456789.012L, 3.456789L, -4567890.12L + 123.45, 67.8, 901.23, 456789.012, 3.456789, -4567890.12 }; std::vector y1 = { - 0.45L, 0.8L, 0.23L, 0.012L, 0.456789L, -0.12L + 0.45, 0.8, 0.23, 0.012, 0.456789, -0.12 }; std::vector y2 = { - 0.45L, 0.8L, 0.23L, 456789.012L, 0.456789L, -4567890.12L + 0.45, 0.8, 0.23, 456789.012, 0.456789, -4567890.12 }; std::vector x2; std::vector x3; @@ -163,21 +292,63 @@ TEST(CudaPlugin, equalModuloPrecision) { y4.push_back(1.0); } - double maxElement1 = 0.0L; - double maxElement2 = 0.0L; - double maxElement3 = 0.0L; - double maxElement4 = 0.0L; + double maxElement1 = 0.0; + double maxElement2 = 0.0; + double maxElement3 = 0.0; + double maxElement4 = 0.0; ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y1, maxElement1)); ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y2, maxElement2)); ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x2, y3, maxElement3)); 
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x3, y4, maxElement4)); - ASSERT_DOUBLE_EQ(4567890.0L, maxElement1); - ASSERT_DOUBLE_EQ(901.0L, maxElement2); + ASSERT_DOUBLE_EQ(4567890.0, maxElement1); + ASSERT_DOUBLE_EQ(901.0, maxElement2); + + ASSERT_DOUBLE_EQ(998.0, maxElement3); + ASSERT_DOUBLE_EQ(1001.0, maxElement4); +} + +TEST(CudaPlugin, equalModuloPrecision_float) { + std::vector x = { + 123.45f, 67.8f, 901.23f, 456789.012f, 3.456789f, -4567890.12f + }; + std::vector y1 = { + 0.45f, 0.8f, 0.23f, 0.012f, 0.456789f, -0.12f + }; + std::vector y2 = { + 0.45f, 0.8f, 0.23f, 456789.012f, 0.456789f, -4567890.12f + }; + std::vector x2; + std::vector x3; + std::vector y3; + std::vector y4; + x2.reserve(1000); + x3.reserve(1000); + y3.reserve(1000); + y4.reserve(1000); + for (size_t i = 0; i < 1000; ++i) { + x2.push_back(static_cast(i)); + y3.push_back(1.0f); + x3.push_back(-(1000.0f - static_cast(i))); + y4.push_back(1.0f); + } + + float maxElement1 = 0.0f; + float maxElement2 = 0.0f; + float maxElement3 = 0.0f; + float maxElement4 = 0.0f; + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y1, maxElement1)); + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_NonRelative(x, y2, maxElement2)); + + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x2, y3, maxElement3)); + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_float_Relative(x3, y4, maxElement4)); + + ASSERT_DOUBLE_EQ(4567890.0f, maxElement1); + ASSERT_DOUBLE_EQ(901.0f, maxElement2); - ASSERT_DOUBLE_EQ(998.0L, maxElement3); - ASSERT_DOUBLE_EQ(1001.0L, maxElement4); + ASSERT_DOUBLE_EQ(998.0f, maxElement3); + ASSERT_DOUBLE_EQ(1001.0f, maxElement4); } #endif \ No newline at end of file
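For reference, the expected maxElement values in the two equalModuloPrecision tests can be reproduced with the host-side sketch below. The semantics (largest componentwise absolute difference, or largest difference relative to the entry of the second vector) are inferred from the test expectations, not taken from the plugin sources.

// Host reference for the maxElement reported by the equalModuloPrecision
// kernels, as suggested by the test data; the relative variant assumes y has
// no zero entries.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

template<typename ValueType, bool Relative>
ValueType maxDifference(std::vector<ValueType> const& x, std::vector<ValueType> const& y) {
    ValueType maxElement = 0;
    for (std::size_t i = 0; i < x.size(); ++i) {
        ValueType diff = Relative ? std::abs((x[i] - y[i]) / y[i]) : std::abs(x[i] - y[i]);
        maxElement = std::max(maxElement, diff);
    }
    return maxElement;
}

With the vectors x and y1 of the double test, maxDifference<double, false>(x, y1) yields |-4567890.12 - (-0.12)| = 4567890.0, matching the expected maxElement1; the relative cases give 998.0 and 1001.0 as asserted above.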