From 208005e68bc535a178a668699ac7cfadf6be8b61 Mon Sep 17 00:00:00 2001 From: PBerger Date: Thu, 13 Mar 2014 20:23:47 +0100 Subject: [PATCH] Added Tests to the Cuda Plugin. Refactored kernel for SpMV to use two vectors for column indexes and values. Former-commit-id: 3560d3cc9a8b0addc36f20704c771f41c7476f3e --- .../srcCuda/basicValueIteration.cu | 372 ++++++++++++++++-- .../srcCuda/basicValueIteration.h | 6 +- .../cudaForStorm/srcCuda/cuspExtension.h | 38 +- test/functional/solver/CudaPluginTest.cpp | 122 +++++- 4 files changed, 485 insertions(+), 53 deletions(-) diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index 712dfac09..e42f36eda 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -25,9 +25,6 @@ std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \ } } while(false) -__global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) { - *B = *A; -} template struct equalModuloPrecision : public thrust::binary_function @@ -36,18 +33,34 @@ __host__ __device__ T operator()(const T &x, const T &y) const { if (Relative) { const T result = (x - y) / y; - return (result > 0) ? result : -result; + return ((result >= 0) ? (result) : (-result)); } else { const T result = (x - y); - return (result > 0) ? result : -result; + return ((result >= 0) ? (result) : (-result)); } } }; +template +void exploadVector(std::vector> const& inputVector, std::vector& indexVector, std::vector& valueVector) { + indexVector.reserve(inputVector.size()); + valueVector.reserve(inputVector.size()); + for (size_t i = 0; i < inputVector.size(); ++i) { + indexVector.push_back(inputVector.at(i).first); + valueVector.push_back(inputVector.at(i).second); + } +} + template void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { + + std::vector matrixColumnIndices; + std::vector matrixValues; + exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); + IndexType* device_matrixRowIndices = nullptr; - IndexType* device_matrixColIndicesAndValues = nullptr; + IndexType* device_matrixColIndices = nullptr; + ValueType* device_matrixValues = nullptr; ValueType* device_x = nullptr; ValueType* device_xSwap = nullptr; ValueType* device_b = nullptr; @@ -76,9 +89,16 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy } CUDA_CHECK_ALL_ERRORS(); - cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; + std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." 
<< std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixValues), sizeof(ValueType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } @@ -128,9 +148,16 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy } CUDA_CHECK_ALL_ERRORS(); - cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); + cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { - std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; + std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl; goto cleanup; } @@ -174,7 +201,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy // Data is on device, start Kernel while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization - cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); thrust::device_ptr devicePtrThrust_b(device_b); @@ -197,11 +224,14 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy CUDA_CHECK_ALL_ERRORS(); // Reduce: get Max over x and check for res < Precision - ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, 0, thrust::maximum()); + ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits::max(), thrust::maximum()); CUDA_CHECK_ALL_ERRORS(); - converged = maxX < precision; + converged = (maxX < precision); ++iterationCount; + // If there are empty rows in the matrix we need to clear multiplyResult + thrust::fill(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, 0); + // Swap pointers, device_x always contains the most current result std::swap(device_x, device_xSwap); } @@ -223,12 +253,19 @@ cleanup: } device_matrixRowIndices = nullptr; } - if (device_matrixColIndicesAndValues != nullptr) { - cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); + if (device_matrixColIndices != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_matrixColIndices = nullptr; + } + if (device_matrixValues != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixValues); if (cudaFreeResult != cudaSuccess) { - std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; + std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl; } - device_matrixColIndicesAndValues = nullptr; + device_matrixValues = nullptr; } if (device_x != nullptr) { cudaError_t cudaFreeResult = cudaFree(device_x); @@ -269,8 +306,13 @@ cleanup: template void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { + std::vector matrixColumnIndices; + std::vector matrixValues; + exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues); + IndexType* device_matrixRowIndices = nullptr; - IndexType* device_matrixColIndicesAndValues = nullptr; + IndexType* device_matrixColIndices = nullptr; + ValueType* device_matrixValues = nullptr; ValueType* device_x = nullptr; ValueType* device_multiplyResult = nullptr; @@ -292,9 +334,16 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixValues), sizeof(ValueType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { - std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; + std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } @@ -323,9 +372,16 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult); CUDA_CHECK_ALL_ERRORS(); // Get result back from the device @@ -363,12 +419,19 @@ cleanup: } device_matrixRowIndices = nullptr; } - if (device_matrixColIndicesAndValues != nullptr) { - cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); + if (device_matrixColIndices != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices); if (cudaFreeResult != cudaSuccess) { - std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; + std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." 
<< std::endl; } - device_matrixColIndicesAndValues = nullptr; + device_matrixColIndices = nullptr; + } + if (device_matrixValues != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_matrixValues); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl; + } + device_matrixValues = nullptr; } if (device_x != nullptr) { cudaError_t cudaFreeResult = cudaFree(device_x); @@ -386,19 +449,266 @@ cleanup: } } +template +void basicValueIteration_addVectorsInplace(std::vector& a, std::vector const& b) { + ValueType* device_a = nullptr; + ValueType* device_b = nullptr; + + const size_t vectorSize = std::max(a.size(), b.size()); + + cudaError_t cudaMallocResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_a), sizeof(ValueType) * vectorSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_b), sizeof(ValueType) * vectorSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + // Memory allocated, copy data to device + cudaError_t cudaCopyResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + do { + // Transform: Add multiplyResult + b inplace to multiplyResult + thrust::device_ptr devicePtrThrust_a(device_a); + thrust::device_ptr devicePtrThrust_b(device_b); + thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus()); + CUDA_CHECK_ALL_ERRORS(); + } while (false); + + // Get result back from the device + cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + // All code related to freeing memory and clearing up the device +cleanup: + if (device_a != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_a); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl; + } + device_a = nullptr; + } + if (device_b != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_b); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_b = nullptr; + } +} + +template +void basicValueIteration_reduceGroupedVector(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { + ValueType* device_groupedVector = nullptr; + IndexType* device_grouping = nullptr; + ValueType* device_target = nullptr; + + const size_t groupedSize = groupedVector.size(); + const size_t groupingSize = grouping.size(); + const size_t targetSize = targetVector.size(); + + cudaError_t cudaMallocResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_groupedVector), sizeof(ValueType) * groupedSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_grouping), sizeof(IndexType) * groupingSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_target), sizeof(ValueType) * targetSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + // Memory allocated, copy data to device + cudaError_t cudaCopyResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + do { + // Reduce: Reduce multiplyResult to a new x vector + cusp::detail::device::storm_cuda_opt_vector_reduce(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); + CUDA_CHECK_ALL_ERRORS(); + } while (false); + + // Get result back from the device + cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + // All code related to freeing memory and clearing up the device +cleanup: + if (device_groupedVector != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_groupedVector); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl; + } + device_groupedVector = nullptr; + } + if (device_grouping != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_grouping); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_grouping = nullptr; + } + if (device_target != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_target); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl; + } + device_target = nullptr; + } +} + +template +void basicValueIteration_equalModuloPrecision(std::vector const& x, std::vector const& y, ValueType& maxElement) { + ValueType* device_x = nullptr; + ValueType* device_y = nullptr; + + const size_t vectorSize = x.size(); + + cudaError_t cudaMallocResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_x), sizeof(ValueType) * vectorSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_y), sizeof(ValueType) * vectorSize); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + // Memory allocated, copy data to device + cudaError_t cudaCopyResult; + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + do { + // Transform: x = abs(x - xSwap)/ xSwap + thrust::device_ptr devicePtrThrust_x(device_x); + thrust::device_ptr devicePtrThrust_y(device_y); + thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision()); + CUDA_CHECK_ALL_ERRORS(); + + // Reduce: get Max over x and check for res < Precision + maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits::max(), thrust::maximum()); + CUDA_CHECK_ALL_ERRORS(); + } while (false); + + // All code related to freeing memory and clearing up the device +cleanup: + if (device_x != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_x); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; + } + device_x = nullptr; + } + if (device_y != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_y); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_y = nullptr; + } +} + /* * Declare and implement all exported functions for these Kernels here * */ -void cudaForStormTestFunction(int a, int b) { - std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl; -} - void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) { basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); } +void basicValueIteration_addVectorsInplace_double(std::vector& a, std::vector const& b) { + basicValueIteration_addVectorsInplace(a, b); +} + +void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { + basicValueIteration_reduceGroupedVector(groupedVector, grouping, targetVector); +} + +void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector) { + basicValueIteration_reduceGroupedVector(groupedVector, grouping, targetVector); +} + +void basicValueIteration_equalModuloPrecision_double_Relative(std::vector const& x, std::vector const& y, double& maxElement) { + basicValueIteration_equalModuloPrecision(x, y, maxElement); +} + +void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector const& x, std::vector const& y, double& maxElement) { + basicValueIteration_equalModuloPrecision(x, y, maxElement); +} + void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { if (relativePrecisionCheck) { basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index 2395c0311..ad4ce0fc9 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -8,9 +8,13 @@ // Library exports #include "cudaForStorm_Export.h" -cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b); cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b); +cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector& a, std::vector const& b); +cudaForStorm_EXPORT void 
basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); +cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector const& groupedVector, std::vector const& grouping, std::vector& targetVector); +cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector const& x, std::vector const& y, double& maxElement); +cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector const& x, std::vector const& y, double& maxElement); #endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h index 34e6e6e14..f07849816 100644 --- a/resources/cudaForStorm/srcCuda/cuspExtension.h +++ b/resources/cudaForStorm/srcCuda/cuspExtension.h @@ -61,7 +61,7 @@ namespace device template __launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) __global__ void -storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y) +storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType * x, ValueType * y) { __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2]; @@ -95,17 +95,17 @@ storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType // accumulate local sums if(jj >= row_start && jj < row_end) - sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x); // accumulate local sums for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) - sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x); } else { // accumulate local sums for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) - sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x); } // store local sum in shared memory @@ -214,7 +214,7 @@ storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType * template void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) { - ValueType __minMaxInitializer = 0; + ValueType __minMaxInitializer = -std::numeric_limits::max(); if (Minimize) { __minMaxInitializer = std::numeric_limits::max(); } @@ -244,7 +244,7 @@ void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_ } template -void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) { const size_t 
THREADS_PER_BLOCK = 128; const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; @@ -256,36 +256,36 @@ void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType bind_x(x); storm_cuda_opt_spmv_csr_vector_kernel <<>> - (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); if (UseCache) unbind_x(x); } template -void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) { const IndexType nnz_per_row = num_entries / num_rows; - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); } template -void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) { const IndexType nnz_per_row = num_entries / num_rows; - if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } - if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); 
return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } - __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); } // NON-OPT diff --git a/test/functional/solver/CudaPluginTest.cpp b/test/functional/solver/CudaPluginTest.cpp index a59697af8..3d2a69c84 100644 --- a/test/functional/solver/CudaPluginTest.cpp +++ b/test/functional/solver/CudaPluginTest.cpp @@ -9,7 +9,7 @@ #include "cudaForStorm.h" -TEST(CudaPlugin, CreationWithDimensions) { +TEST(CudaPlugin, SpMV_4x4) { storm::storage::SparseMatrixBuilder matrixBuilder(4, 4, 10); ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0)); ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0)); @@ -41,7 +41,7 @@ TEST(CudaPlugin, CreationWithDimensions) { ASSERT_EQ(b.at(3), 0); } -TEST(CudaPlugin, VerySmall) { +TEST(CudaPlugin, SpMV_VerySmall) { storm::storage::SparseMatrixBuilder matrixBuilder(2, 2, 2); ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0)); ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0)); @@ -62,4 +62,122 @@ TEST(CudaPlugin, VerySmall) { ASSERT_EQ(b.at(1), 16.0); } +TEST(CudaPlugin, AddVectorsInplace) { + std::vector vectorA_1 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; + std::vector vectorA_2 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; + std::vector vectorA_3 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 }; + std::vector vectorB = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + std::vector vectorC = { -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0 }; + + ASSERT_EQ(vectorA_1.size(), 8); + ASSERT_EQ(vectorA_2.size(), 8); + ASSERT_EQ(vectorA_3.size(), 8); + ASSERT_EQ(vectorB.size(), 8); + ASSERT_EQ(vectorC.size(), 8); + + ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_1, vectorB)); + ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_2, vectorC)); + + ASSERT_EQ(vectorA_1.size(), 8); + ASSERT_EQ(vectorA_2.size(), 8); + ASSERT_EQ(vectorA_3.size(), 8); + ASSERT_EQ(vectorB.size(), 8); + ASSERT_EQ(vectorC.size(), 8); + + for (size_t i = 0; i < vectorA_3.size(); ++i) { + double cpu_result_b = vectorA_3.at(i) + vectorB.at(i); + double cpu_result_c = vectorA_3.at(i) + vectorC.at(i); + + ASSERT_EQ(cpu_result_b, vectorA_1.at(i)); + ASSERT_EQ(cpu_result_c, vectorA_2.at(i)); + } +} + +TEST(CudaPlugin, ReduceGroupedVector) { + std::vector groupedVector = { + 0.0, -1000.0, 0.000004, // Group 0 + 5.0, // Group 1 + 0.0, 1.0, 2.0, 3.0, // Group 2 + -1000.0, -3.14, -0.0002,// Group 3 (neg only) + 25.25, 25.25, 25.25, // Group 4 + 0.0, 0.0, 1.0, // Group 5 + -0.000001, 0.000001 // Group 6 + }; + std::vector grouping = { + 0, 3, 4, 8, 11, 14, 17, 19 + }; + + std::vector result_minimize = { + -1000.0, // Group 0 + 5.0, + 0.0, + -1000.0, + 25.25, + 0.0, + -0.000001 + }; + std::vector result_maximize = { + 0.000004, + 5.0, + 3.0, + -0.0002, + 25.25, + 1.0, + 0.000001 + }; + + std::vector result_cuda_minimize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + std::vector result_cuda_maximize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + + 
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_minimize(groupedVector, grouping, result_cuda_minimize)); + ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_maximize(groupedVector, grouping, result_cuda_maximize)); + + for (size_t i = 0; i < result_minimize.size(); ++i) { + ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i)); + ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i)); + } +} + +TEST(CudaPlugin, equalModuloPrecision) { + std::vector x = { + 123.45L, 67.8L, 901.23L, 456789.012L, 3.456789L, -4567890.12L + }; + std::vector y1 = { + 0.45L, 0.8L, 0.23L, 0.012L, 0.456789L, -0.12L + }; + std::vector y2 = { + 0.45L, 0.8L, 0.23L, 456789.012L, 0.456789L, -4567890.12L + }; + std::vector x2; + std::vector x3; + std::vector y3; + std::vector y4; + x2.reserve(1000); + x3.reserve(1000); + y3.reserve(1000); + y4.reserve(1000); + for (size_t i = 0; i < 1000; ++i) { + x2.push_back(static_cast(i)); + y3.push_back(1.0); + x3.push_back(-(1000.0 - static_cast(i))); + y4.push_back(1.0); + } + + double maxElement1 = 0.0L; + double maxElement2 = 0.0L; + double maxElement3 = 0.0L; + double maxElement4 = 0.0L; + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y1, maxElement1)); + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y2, maxElement2)); + + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x2, y3, maxElement3)); + ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x3, y4, maxElement4)); + + ASSERT_DOUBLE_EQ(4567890.0L, maxElement1); + ASSERT_DOUBLE_EQ(901.0L, maxElement2); + + ASSERT_DOUBLE_EQ(998.0L, maxElement3); + ASSERT_DOUBLE_EQ(1001.0L, maxElement4); +} + #endif \ No newline at end of file
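Editor's note on the SpMV data-layout refactor: the previous kernel read one interleaved device array in which each non-zero stored its column index and value side by side (matrixColumnIndicesAndValues[2 * jj] and [(2 * jj) + 1]); this patch splits the data into two flat arrays, so the kernel now reads matrixColumnIndices[jj] and matrixValues[jj]. The following host-side sketch illustrates the split performed by exploadVector and the corresponding inner product of one CSR row. It is a simplified CPU illustration under the patch's data layout, not the CUDA kernel itself; the names splitColumnsAndValues and csrSpmvRow are illustrative and not part of the plugin API.

#include <cstdint>
#include <utility>
#include <vector>

// Split the interleaved (columnIndex, value) pairs into two flat arrays,
// mirroring what exploadVector does before the data is copied to the device.
template<typename IndexType, typename ValueType>
void splitColumnsAndValues(std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues,
                           std::vector<IndexType>& columnIndices, std::vector<ValueType>& values) {
    columnIndices.reserve(columnIndicesAndValues.size());
    values.reserve(columnIndicesAndValues.size());
    for (auto const& entry : columnIndicesAndValues) {
        columnIndices.push_back(entry.first);
        values.push_back(entry.second);
    }
}

// CPU illustration of one CSR row of y = A * x with the new layout: the kernel
// indexes columnIndices[jj] and values[jj] instead of the former
// matrixColumnIndicesAndValues[2 * jj] and [(2 * jj) + 1].
template<typename IndexType, typename ValueType>
ValueType csrSpmvRow(IndexType rowStart, IndexType rowEnd,
                     std::vector<IndexType> const& columnIndices,
                     std::vector<ValueType> const& values,
                     std::vector<ValueType> const& x) {
    ValueType sum = 0;
    for (IndexType jj = rowStart; jj < rowEnd; ++jj) {
        sum += values[jj] * x[columnIndices[jj]];
    }
    return sum;
}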
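Editor's note on the grouped reduction exercised by the ReduceGroupedVector test: the grouping vector holds row-group boundaries, i.e. group i covers the entries grouping[i] .. grouping[i + 1] - 1, and for each group the minimum (minimize) or maximum (maximize) of the grouped values is written to the target vector. Below is a minimal CPU reference of that semantics against which the test data above can be checked; reduceGroupedVectorReference is an illustrative name, not part of the exported interface.

#include <algorithm>
#include <cstdint>
#include <vector>

// CPU reference for the grouped min/max reduction performed by
// storm_cuda_opt_vector_reduce on the device.
template<bool Minimize>
void reduceGroupedVectorReference(std::vector<double> const& groupedVector,
                                  std::vector<uint_fast64_t> const& grouping,
                                  std::vector<double>& targetVector) {
    for (size_t group = 0; group + 1 < grouping.size(); ++group) {
        double best = groupedVector[grouping[group]];
        for (uint_fast64_t j = grouping[group] + 1; j < grouping[group + 1]; ++j) {
            best = Minimize ? std::min(best, groupedVector[j]) : std::max(best, groupedVector[j]);
        }
        targetVector[group] = best;
    }
}
// For the test data above, reduceGroupedVectorReference<true> reproduces
// result_minimize and reduceGroupedVectorReference<false> reproduces result_maximize.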
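Editor's note on the equalModuloPrecision convergence check: per component it computes either the relative difference |x_i - y_i| / y_i or the absolute difference |x_i - y_i| and then takes the maximum; the reduction is now seeded with -std::numeric_limits<ValueType>::max() instead of 0 so that a difference vector whose entries are all negative (or all smaller than the old seed) cannot mask the true maximum. A plain CPU equivalent follows, assuming y_i != 0 in the relative case; the name maxDifferenceReference is illustrative.

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// CPU equivalent of the transform + max-reduce used for the convergence check.
// Relative == true divides the componentwise difference by y[i] (assumed non-zero).
template<bool Relative>
double maxDifferenceReference(std::vector<double> const& x, std::vector<double> const& y) {
    double maxElement = -std::numeric_limits<double>::max();
    for (size_t i = 0; i < x.size(); ++i) {
        double diff = Relative ? std::fabs((x[i] - y[i]) / y[i]) : std::fabs(x[i] - y[i]);
        maxElement = std::max(maxElement, diff);
    }
    return maxElement;
}
// With the vectors from the equalModuloPrecision test this yields 4567890.0 and
// 901.0 for the non-relative calls and 998.0 and 1001.0 for the relative calls.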
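Editor's note on the overall mvReduce loop: each iteration multiplies the matrix with the current x, adds b, reduces the result over the nondeterministic row groups (min or max), measures the distance to the previous x, and swaps the two x buffers; on the device the multiplyResult buffer is additionally cleared after the check so that empty matrix rows do not keep stale values. The sketch below ties these steps together on the CPU, reusing the helper sketches above (csrSpmvRow, reduceGroupedVectorReference, maxDifferenceReference); valueIterationStepSketch is an illustrative name and not the plugin's actual code path, which runs these steps with the CUDA kernels and Thrust calls shown in the patch.

#include <cstdint>
#include <vector>

// One value-iteration step: returns true once the maximum (relative or
// absolute) change of x drops below the given precision.
template<bool Minimize, bool Relative>
bool valueIterationStepSketch(std::vector<uint_fast64_t> const& rowIndices,
                              std::vector<uint_fast64_t> const& columnIndices,
                              std::vector<double> const& values,
                              std::vector<double> const& b,
                              std::vector<uint_fast64_t> const& nondeterministicChoiceIndices,
                              std::vector<double>& x, double precision) {
    size_t const rowCount = rowIndices.size() - 1;
    std::vector<double> multiplyResult(rowCount, 0.0);
    for (size_t row = 0; row < rowCount; ++row) {
        // multiplyResult = A * x + b (the device code does the SpMV and the
        // thrust::transform with thrust::plus in two separate steps)
        multiplyResult[row] = b[row]
            + csrSpmvRow<uint_fast64_t, double>(rowIndices[row], rowIndices[row + 1], columnIndices, values, x);
    }
    // Reduce over the nondeterministic choices of each state into the new x.
    std::vector<double> xSwap(x.size(), 0.0);
    reduceGroupedVectorReference<Minimize>(multiplyResult, nondeterministicChoiceIndices, xSwap);
    // Convergence measure between old and new x, then swap the buffers
    // (the device code swaps the pointers device_x and device_xSwap).
    double const maxDiff = maxDifferenceReference<Relative>(x, xSwap);
    x.swap(xSwap);
    return maxDiff < precision;
}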