
Added tests for the CUDA plugin.

Refactored the SpMV kernel to use two separate vectors for column indices and values.
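In outline: the previous device layout interleaved each nonzero's column index and value in a single array, which the kernel indexed as matrixColumnIndicesAndValues[2 * jj] and matrixColumnIndicesAndValues[(2 * jj) + 1]; that only works when index and value types happen to have the same size (uint_fast64_t and double, 8 bytes each). The refactoring splits the host-side vector of (index, value) pairs into two flat vectors before upload. A minimal host-side sketch of that split (names here are illustrative; the diff's own helper is exploadVector below):

#include <cstdint>
#include <utility>
#include <vector>

// Sketch of the layout change: instead of one interleaved array of
// (column index, value) pairs, the CSR data is split into two flat
// vectors that are allocated and copied to the device independently.
template<typename IndexType, typename ValueType>
void splitColumnsAndValues(std::vector<std::pair<IndexType, ValueType>> const& pairs,
                           std::vector<IndexType>& columns,
                           std::vector<ValueType>& values) {
    columns.reserve(pairs.size());
    values.reserve(pairs.size());
    for (auto const& entry : pairs) {
        columns.push_back(entry.first);   // column index of the nonzero
        values.push_back(entry.second);   // value of the nonzero
    }
}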


Former-commit-id: 3560d3cc9a
Branch: tempestpy_adaptions
Author: PBerger, 11 years ago
parent commit 208005e68b
  1. resources/cudaForStorm/srcCuda/basicValueIteration.cu (372 lines changed)
  2. resources/cudaForStorm/srcCuda/basicValueIteration.h (6 lines changed)
  3. resources/cudaForStorm/srcCuda/cuspExtension.h (38 lines changed)
  4. test/functional/solver/CudaPluginTest.cpp (122 lines changed)

resources/cudaForStorm/srcCuda/basicValueIteration.cu (372 lines changed)

@@ -25,9 +25,6 @@
std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \
} } while(false)
__global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) {
*B = *A;
}
template<typename T, bool Relative>
struct equalModuloPrecision : public thrust::binary_function<T,T,T>
@@ -36,18 +33,34 @@ __host__ __device__ T operator()(const T &x, const T &y) const
{
if (Relative) {
const T result = (x - y) / y;
return (result > 0) ? result : -result;
return ((result >= 0) ? (result) : (-result));
} else {
const T result = (x - y);
return (result > 0) ? result : -result;
return ((result >= 0) ? (result) : (-result));
}
}
};
template<typename IndexType, typename ValueType>
void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) {
indexVector.reserve(inputVector.size());
valueVector.reserve(inputVector.size());
for (size_t i = 0; i < inputVector.size(); ++i) {
indexVector.push_back(inputVector.at(i).first);
valueVector.push_back(inputVector.at(i).second);
}
}
template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) {
std::vector<IndexType> matrixColumnIndices;
std::vector<ValueType> matrixValues;
exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
IndexType* device_matrixRowIndices = nullptr;
IndexType* device_matrixColIndicesAndValues = nullptr;
IndexType* device_matrixColIndices = nullptr;
ValueType* device_matrixValues = nullptr;
ValueType* device_x = nullptr;
ValueType* device_xSwap = nullptr;
ValueType* device_b = nullptr;
@@ -76,9 +89,16 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixValues), sizeof(ValueType) * matrixNnzCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
@@ -128,9 +148,16 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
@@ -174,7 +201,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
// Data is on device, start Kernel
while (!converged && iterationCount < maxIterationCount)
{ // In a sub-area since transfer of control via label evades initialization
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS();
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
@@ -197,11 +224,14 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
CUDA_CHECK_ALL_ERRORS();
// Reduce: get Max over x and check for res < Precision
ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, 0, thrust::maximum<ValueType>());
ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
CUDA_CHECK_ALL_ERRORS();
converged = maxX < precision;
converged = (maxX < precision);
++iterationCount;
// If there are empty rows in the matrix we need to clear multiplyResult
thrust::fill(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, 0);
// Swap pointers, device_x always contains the most current result
std::swap(device_x, device_xSwap);
}
@@ -223,12 +253,19 @@ cleanup:
}
device_matrixRowIndices = nullptr;
}
if (device_matrixColIndicesAndValues != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
if (device_matrixColIndices != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl;
}
device_matrixColIndices = nullptr;
}
if (device_matrixValues != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_matrixValues);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl;
}
device_matrixColIndicesAndValues = nullptr;
device_matrixValues = nullptr;
}
if (device_x != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_x);
@@ -269,8 +306,13 @@ cleanup:
template <typename IndexType, typename ValueType>
void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) {
std::vector<IndexType> matrixColumnIndices;
std::vector<ValueType> matrixValues;
exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
IndexType* device_matrixRowIndices = nullptr;
IndexType* device_matrixColIndicesAndValues = nullptr;
IndexType* device_matrixColIndices = nullptr;
ValueType* device_matrixValues = nullptr;
ValueType* device_x = nullptr;
ValueType* device_multiplyResult = nullptr;
@@ -292,9 +334,16 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixValues), sizeof(ValueType) * matrixNnzCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
@@ -323,9 +372,16 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
@@ -344,7 +400,7 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
goto cleanup;
}
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS();
// Get result back from the device
@@ -363,12 +419,19 @@ cleanup:
}
device_matrixRowIndices = nullptr;
}
if (device_matrixColIndicesAndValues != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
if (device_matrixColIndices != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl;
}
device_matrixColIndicesAndValues = nullptr;
device_matrixColIndices = nullptr;
}
if (device_matrixValues != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_matrixValues);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl;
}
device_matrixValues = nullptr;
}
if (device_x != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_x);
@@ -386,19 +449,266 @@ cleanup:
}
}
template <typename ValueType>
void basicValueIteration_addVectorsInplace(std::vector<ValueType>& a, std::vector<ValueType> const& b) {
ValueType* device_a = nullptr;
ValueType* device_b = nullptr;
const size_t vectorSize = std::max(a.size(), b.size());
cudaError_t cudaMallocResult;
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_a), sizeof(ValueType) * vectorSize);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * vectorSize);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
// Memory allocated, copy data to device
cudaError_t cudaCopyResult;
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
do {
// Transform: Add multiplyResult + b inplace to multiplyResult
thrust::device_ptr<ValueType> devicePtrThrust_a(device_a);
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus<ValueType>());
CUDA_CHECK_ALL_ERRORS();
} while (false);
// Get result back from the device
cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
// All code related to freeing memory and clearing up the device
cleanup:
if (device_a != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_a);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl;
}
device_a = nullptr;
}
if (device_b != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_b);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl;
}
device_b = nullptr;
}
}
template <typename IndexType, typename ValueType, bool Minimize>
void basicValueIteration_reduceGroupedVector(std::vector<ValueType> const& groupedVector, std::vector<IndexType> const& grouping, std::vector<ValueType>& targetVector) {
ValueType* device_groupedVector = nullptr;
IndexType* device_grouping = nullptr;
ValueType* device_target = nullptr;
const size_t groupedSize = groupedVector.size();
const size_t groupingSize = grouping.size();
const size_t targetSize = targetVector.size();
cudaError_t cudaMallocResult;
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_groupedVector), sizeof(ValueType) * groupedSize);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_grouping), sizeof(IndexType) * groupingSize);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_target), sizeof(ValueType) * targetSize);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
// Memory allocated, copy data to device
cudaError_t cudaCopyResult;
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
do {
// Reduce: Reduce multiplyResult to a new x vector
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, IndexType, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector);
CUDA_CHECK_ALL_ERRORS();
} while (false);
// Get result back from the device
cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
// All code related to freeing memory and clearing up the device
cleanup:
if (device_groupedVector != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_groupedVector);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl;
}
device_groupedVector = nullptr;
}
if (device_grouping != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_grouping);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." << std::endl;
}
device_grouping = nullptr;
}
if (device_target != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_target);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl;
}
device_target = nullptr;
}
}
template <typename ValueType, bool Relative>
void basicValueIteration_equalModuloPrecision(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType& maxElement) {
ValueType* device_x = nullptr;
ValueType* device_y = nullptr;
const size_t vectorSize = x.size();
cudaError_t cudaMallocResult;
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * vectorSize);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_y), sizeof(ValueType) * vectorSize);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
// Memory allocated, copy data to device
cudaError_t cudaCopyResult;
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
do {
// Transform: x = abs(x - xSwap)/ xSwap
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
thrust::device_ptr<ValueType> devicePtrThrust_y(device_y);
thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
CUDA_CHECK_ALL_ERRORS();
// Reduce: get Max over x and check for res < Precision
maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
CUDA_CHECK_ALL_ERRORS();
} while (false);
// All code related to freeing memory and clearing up the device
cleanup:
if (device_x != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_x);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
}
device_x = nullptr;
}
if (device_y != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_y);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." << std::endl;
}
device_y = nullptr;
}
}
/*
* Declare and implement all exported functions for these Kernels here
*
*/
void cudaForStormTestFunction(int a, int b) {
std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl;
}
void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) {
basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
}
void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) {
basicValueIteration_addVectorsInplace<double>(a, b);
}
void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(groupedVector, grouping, targetVector);
}
void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
basicValueIteration_reduceGroupedVector<uint_fast64_t, double, false>(groupedVector, grouping, targetVector);
}
void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
basicValueIteration_equalModuloPrecision<double, true>(x, y, maxElement);
}
void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement);
}
void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
if (relativePrecisionCheck) {
basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);

resources/cudaForStorm/srcCuda/basicValueIteration.h (6 lines changed)

@@ -8,9 +8,13 @@
// Library exports
#include "cudaForStorm_Export.h"
cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b);
cudaForStorm_EXPORT void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b);
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector);
cudaForStorm_EXPORT void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector);
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement);
cudaForStorm_EXPORT void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement);
#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
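For orientation, a minimal caller of the exported SpMV entry point could look like the sketch below; it is not part of this commit. The matrix data mirrors the SpMV_VerySmall test further down, and matrixRowIndices are assumed to be CSR row offsets, which is how the kernel reads them.

#include <cstdint>
#include <utility>
#include <vector>

#include "cudaForStorm.h"

// Illustrative only: multiply a 2x2 diagonal CSR matrix with a vector on
// the device via the exported wrapper. Row indices are CSR row offsets;
// each nonzero is a (column index, value) pair.
int main() {
    std::vector<uint_fast64_t> rowIndices = {0, 1, 2};
    std::vector<std::pair<uint_fast64_t, double>> entries = {
        {0, 1.0},  // matrix(0, 0) = 1.0
        {1, 2.0}   // matrix(1, 1) = 2.0
    };
    std::vector<double> x = {4.0, 8.0};
    std::vector<double> b(2, 0.0);
    basicValueIteration_spmv_uint64_double(2, rowIndices, entries, x, b);
    // Expected: b == {4.0, 16.0}, matching the SpMV_VerySmall test below.
    return 0;
}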

resources/cudaForStorm/srcCuda/cuspExtension.h (38 lines changed)

@@ -61,7 +61,7 @@ namespace device
template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y)
storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType * x, ValueType * y)
{
__shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals
__shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];
@@ -95,17 +95,17 @@ storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType
// accumulate local sums
if(jj >= row_start && jj < row_end)
sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2 * jj], x);
sum += matrixValues[jj] * fetch_x<UseCache>(matrixColumnIndices[jj], x);
// accumulate local sums
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2 * jj], x);
sum += matrixValues[jj] * fetch_x<UseCache>(matrixColumnIndices[jj], x);
}
else
{
// accumulate local sums
for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2 * jj], x);
sum += matrixValues[jj] * fetch_x<UseCache>(matrixColumnIndices[jj], x);
}
// store local sum in shared memory
@@ -214,7 +214,7 @@ storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType *
template <bool Minimize, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y)
{
ValueType __minMaxInitializer = 0;
ValueType __minMaxInitializer = -std::numeric_limits<ValueType>::max();
if (Minimize) {
__minMaxInitializer = std::numeric_limits<ValueType>::max();
}
@@ -244,7 +244,7 @@ void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_
}
template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
@@ -256,36 +256,36 @@ void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType
bind_x(x);
storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
if (UseCache)
unbind_x(x);
}
template <typename IndexType, typename ValueType>
void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<false, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<false, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<false, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<false,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
__storm_cuda_opt_spmv_csr_vector<false,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}
template <typename IndexType, typename ValueType>
void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<true, 2, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<true, 4, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<true, 8, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<true,16, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
__storm_cuda_opt_spmv_csr_vector<true,32, IndexType, ValueType>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}
// NON-OPT
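A quieter fix in this file is the neutral element of the max reduction: __minMaxInitializer changes from 0 to -std::numeric_limits<ValueType>::max(), because seeding a maximum with 0 clamps the result to 0 whenever all inputs are negative (the same change appears in the thrust::reduce call in basicValueIteration.cu). A small host-side illustration of the pitfall, using the negative-only group from the test data below:

#include <algorithm>
#include <iostream>
#include <limits>
#include <numeric>
#include <vector>

// Seeding a max-reduction with 0 silently loses all-negative data;
// seeding with the lowest representable value returns the true maximum.
int main() {
    std::vector<double> group = {-1000.0, -3.14, -0.0002};
    auto maxOp = [](double a, double b) { return std::max(a, b); };
    double wrong = std::accumulate(group.begin(), group.end(), 0.0, maxOp);
    double right = std::accumulate(group.begin(), group.end(),
                                   -std::numeric_limits<double>::max(), maxOp);
    std::cout << wrong << " vs " << right << std::endl; // prints "0 vs -0.0002"
    return 0;
}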

test/functional/solver/CudaPluginTest.cpp (122 lines changed)

@@ -9,7 +9,7 @@
#include "cudaForStorm.h"
TEST(CudaPlugin, CreationWithDimensions) {
TEST(CudaPlugin, SpMV_4x4) {
storm::storage::SparseMatrixBuilder<double> matrixBuilder(4, 4, 10);
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 1, 1.0));
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 3, -1.0));
@@ -41,7 +41,7 @@ TEST(CudaPlugin, CreationWithDimensions) {
ASSERT_EQ(b.at(3), 0);
}
TEST(CudaPlugin, VerySmall) {
TEST(CudaPlugin, SpMV_VerySmall) {
storm::storage::SparseMatrixBuilder<double> matrixBuilder(2, 2, 2);
ASSERT_NO_THROW(matrixBuilder.addNextValue(0, 0, 1.0));
ASSERT_NO_THROW(matrixBuilder.addNextValue(1, 1, 2.0));
@@ -62,4 +62,122 @@ TEST(CudaPlugin, VerySmall) {
ASSERT_EQ(b.at(1), 16.0);
}
TEST(CudaPlugin, AddVectorsInplace) {
std::vector<double> vectorA_1 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 };
std::vector<double> vectorA_2 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 };
std::vector<double> vectorA_3 = { 0.0, 42.0, 21.4, 3.1415, 1.0, 7.3490390, 94093053905390.21, -0.000000000023 };
std::vector<double> vectorB = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
std::vector<double> vectorC = { -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0, -5000.0 };
ASSERT_EQ(vectorA_1.size(), 8);
ASSERT_EQ(vectorA_2.size(), 8);
ASSERT_EQ(vectorA_3.size(), 8);
ASSERT_EQ(vectorB.size(), 8);
ASSERT_EQ(vectorC.size(), 8);
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_1, vectorB));
ASSERT_NO_THROW(basicValueIteration_addVectorsInplace_double(vectorA_2, vectorC));
ASSERT_EQ(vectorA_1.size(), 8);
ASSERT_EQ(vectorA_2.size(), 8);
ASSERT_EQ(vectorA_3.size(), 8);
ASSERT_EQ(vectorB.size(), 8);
ASSERT_EQ(vectorC.size(), 8);
for (size_t i = 0; i < vectorA_3.size(); ++i) {
double cpu_result_b = vectorA_3.at(i) + vectorB.at(i);
double cpu_result_c = vectorA_3.at(i) + vectorC.at(i);
ASSERT_EQ(cpu_result_b, vectorA_1.at(i));
ASSERT_EQ(cpu_result_c, vectorA_2.at(i));
}
}
TEST(CudaPlugin, ReduceGroupedVector) {
std::vector<double> groupedVector = {
0.0, -1000.0, 0.000004, // Group 0
5.0, // Group 1
0.0, 1.0, 2.0, 3.0, // Group 2
-1000.0, -3.14, -0.0002,// Group 3 (neg only)
25.25, 25.25, 25.25, // Group 4
0.0, 0.0, 1.0, // Group 5
-0.000001, 0.000001 // Group 6
};
std::vector<uint_fast64_t> grouping = {
0, 3, 4, 8, 11, 14, 17, 19
};
std::vector<double> result_minimize = {
-1000.0, // Group 0
5.0,
0.0,
-1000.0,
25.25,
0.0,
-0.000001
};
std::vector<double> result_maximize = {
0.000004,
5.0,
3.0,
-0.0002,
25.25,
1.0,
0.000001
};
std::vector<double> result_cuda_minimize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
std::vector<double> result_cuda_maximize = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_minimize(groupedVector, grouping, result_cuda_minimize));
ASSERT_NO_THROW(basicValueIteration_reduceGroupedVector_uint64_double_maximize(groupedVector, grouping, result_cuda_maximize));
for (size_t i = 0; i < result_minimize.size(); ++i) {
ASSERT_EQ(result_minimize.at(i), result_cuda_minimize.at(i));
ASSERT_EQ(result_maximize.at(i), result_cuda_maximize.at(i));
}
}
TEST(CudaPlugin, equalModuloPrecision) {
std::vector<double> x = {
123.45L, 67.8L, 901.23L, 456789.012L, 3.456789L, -4567890.12L
};
std::vector<double> y1 = {
0.45L, 0.8L, 0.23L, 0.012L, 0.456789L, -0.12L
};
std::vector<double> y2 = {
0.45L, 0.8L, 0.23L, 456789.012L, 0.456789L, -4567890.12L
};
std::vector<double> x2;
std::vector<double> x3;
std::vector<double> y3;
std::vector<double> y4;
x2.reserve(1000);
x3.reserve(1000);
y3.reserve(1000);
y4.reserve(1000);
for (size_t i = 0; i < 1000; ++i) {
x2.push_back(static_cast<double>(i));
y3.push_back(1.0);
x3.push_back(-(1000.0 - static_cast<double>(i)));
y4.push_back(1.0);
}
double maxElement1 = 0.0L;
double maxElement2 = 0.0L;
double maxElement3 = 0.0L;
double maxElement4 = 0.0L;
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y1, maxElement1));
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_NonRelative(x, y2, maxElement2));
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x2, y3, maxElement3));
ASSERT_NO_THROW(basicValueIteration_equalModuloPrecision_double_Relative(x3, y4, maxElement4));
ASSERT_DOUBLE_EQ(4567890.0L, maxElement1);
ASSERT_DOUBLE_EQ(901.0L, maxElement2);
ASSERT_DOUBLE_EQ(998.0L, maxElement3);
ASSERT_DOUBLE_EQ(1001.0L, maxElement4);
}
#endif
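Note on the ReduceGroupedVector test: the grouping vector acts like CSR row offsets over groupedVector, so group i spans the half-open index range [grouping[i], grouping[i+1]). A CPU cross-check of the minimizing reduction (illustration only, not part of the plugin):

#include <algorithm>
#include <cstdint>
#include <vector>

// CPU reference for the grouped minimum: grouping holds offsets, so group i
// covers groupedVector[grouping[i]] up to groupedVector[grouping[i + 1] - 1].
std::vector<double> reduceGroupsMinimize(std::vector<double> const& groupedVector,
                                         std::vector<uint_fast64_t> const& grouping) {
    std::vector<double> result;
    for (size_t i = 0; i + 1 < grouping.size(); ++i) {
        result.push_back(*std::min_element(groupedVector.begin() + grouping[i],
                                           groupedVector.begin() + grouping[i + 1]));
    }
    return result;
}
// With the test's grouping {0, 3, 4, 8, 11, 14, 17, 19} this yields the seven
// values of result_minimize, e.g. group 0 = min(0.0, -1000.0, 0.000004) = -1000.0.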