You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
879 lines
40 KiB
879 lines
40 KiB
#include "basicValueIteration.h"
|
|
#define CUSP_USE_TEXTURE_MEMORY
|
|
|
|
#include <iostream>
|
|
#include <chrono>
|
|
|
|
#include <cuda_runtime.h>
|
|
#include "cusparse_v2.h"
|
|
|
|
#include "utility.h"
|
|
|
|
#include "cuspExtension.h"
|
|
|
|
#include <thrust/transform.h>
|
|
#include <thrust/device_ptr.h>
|
|
#include <thrust/functional.h>
|
|
|
|
#include "storm-cudaplugin-config.h"
|
|
|
|
#ifdef DEBUG
|
|
#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false)
|
|
#else
|
|
#define CUDA_CHECK_ALL_ERRORS() do {} while (false)
|
|
#endif
|
|
|
|
template<typename T, bool Relative>
|
|
struct equalModuloPrecision : public thrust::binary_function<T,T,T>
|
|
{
|
|
__host__ __device__ T operator()(const T &x, const T &y) const
|
|
{
|
|
if (Relative) {
|
|
if (y == 0) {
|
|
return ((x >= 0) ? (x) : (-x));
|
|
}
|
|
const T result = (x - y) / y;
|
|
return ((result >= 0) ? (result) : (-result));
|
|
} else {
|
|
const T result = (x - y);
|
|
return ((result >= 0) ? (result) : (-result));
|
|
}
|
|
}
|
|
};
|
|
|
|
template<typename IndexType, typename ValueType>
|
|
void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) {
|
|
indexVector.reserve(inputVector.size());
|
|
valueVector.reserve(inputVector.size());
|
|
for (size_t i = 0; i < inputVector.size(); ++i) {
|
|
indexVector.push_back(inputVector.at(i).first);
|
|
valueVector.push_back(inputVector.at(i).second);
|
|
}
|
|
}
|
|
|
|
// TEMPLATE VERSION
|
|
template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
|
|
bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) {
|
|
//std::vector<IndexType> matrixColumnIndices;
|
|
//std::vector<ValueType> matrixValues;
|
|
//exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
|
|
bool errorOccured = false;
|
|
|
|
IndexType* device_matrixRowIndices = nullptr;
|
|
ValueType* device_matrixColIndicesAndValues = nullptr;
|
|
ValueType* device_x = nullptr;
|
|
ValueType* device_xSwap = nullptr;
|
|
ValueType* device_b = nullptr;
|
|
ValueType* device_multiplyResult = nullptr;
|
|
IndexType* device_nondeterministicChoiceIndices = nullptr;
|
|
|
|
#ifdef DEBUG
|
|
std::cout.sync_with_stdio(true);
|
|
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
|
|
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
|
|
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
|
|
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
|
|
#endif
|
|
|
|
const IndexType matrixRowCount = matrixRowIndices.size() - 1;
|
|
const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1;
|
|
const IndexType matrixNnzCount = columnIndicesAndValues.size();
|
|
|
|
cudaError_t cudaMallocResult;
|
|
|
|
bool converged = false;
|
|
iterationCount = 0;
|
|
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
|
|
if (cudaMallocResult != cudaSuccess) {
|
|
std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
|
|
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true
|
|
#else
|
|
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false
|
|
#endif
|
|
if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
|
|
if (cudaMallocResult != cudaSuccess) {
|
|
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
} else {
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
|
|
if (cudaMallocResult != cudaSuccess) {
|
|
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
|
|
if (cudaMallocResult != cudaSuccess) {
|
|
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_xSwap), sizeof(ValueType) * matrixColCount);
|
|
if (cudaMallocResult != cudaSuccess) {
|
|
std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * matrixRowCount);
|
|
if (cudaMallocResult != cudaSuccess) {
|
|
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
|
|
if (cudaMallocResult != cudaSuccess) {
|
|
std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1));
|
|
if (cudaMallocResult != cudaSuccess) {
|
|
std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
std::cout << "(DLL) Finished allocating memory." << std::endl;
|
|
#endif
|
|
|
|
// Memory allocated, copy data to device
|
|
cudaError_t cudaCopyResult;
|
|
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
|
|
if (cudaCopyResult != cudaSuccess) {
|
|
std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
// Copy all data as floats are expanded to 64bits :/
|
|
if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
|
|
if (cudaCopyResult != cudaSuccess) {
|
|
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
} else {
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
|
|
if (cudaCopyResult != cudaSuccess) {
|
|
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
|
|
if (cudaCopyResult != cudaSuccess) {
|
|
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
// Preset the xSwap to zeros...
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount);
|
|
if (cudaCopyResult != cudaSuccess) {
|
|
std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice);
|
|
if (cudaCopyResult != cudaSuccess) {
|
|
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
// Preset the multiplyResult to zeros...
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
|
|
if (cudaCopyResult != cudaSuccess) {
|
|
std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice);
|
|
if (cudaCopyResult != cudaSuccess) {
|
|
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
|
|
#endif
|
|
|
|
// Data is on device, start Kernel
|
|
while (!converged && iterationCount < maxIterationCount) { // In a sub-area since transfer of control via label evades initialization
|
|
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
|
|
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
|
|
thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult);
|
|
|
|
// Transform: Add multiplyResult + b inplace to multiplyResult
|
|
thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>());
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
|
|
// Reduce: Reduce multiplyResult to a new x vector
|
|
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
|
|
// Check for convergence
|
|
// Transform: x = abs(x - xSwap)/ xSwap
|
|
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
|
|
thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount);
|
|
thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap);
|
|
thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
|
|
// Reduce: get Max over x and check for res < Precision
|
|
ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
|
|
CUDA_CHECK_ALL_ERRORS();
|
|
converged = (maxX < precision);
|
|
++iterationCount;
|
|
|
|
// Swap pointers, device_x always contains the most current result
|
|
std::swap(device_x, device_xSwap);
|
|
}
|
|
|
|
if (!converged && (iterationCount == maxIterationCount)) {
|
|
iterationCount = 0;
|
|
errorOccured = true;
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
std::cout << "(DLL) Finished kernel execution." << std::endl;
|
|
std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
|
|
#endif
|
|
|
|
// Get x back from the device
|
|
cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost);
|
|
if (cudaCopyResult != cudaSuccess) {
|
|
std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl;
|
|
errorOccured = true;
|
|
goto cleanup;
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
std::cout << "(DLL) Finished copying result data." << std::endl;
|
|
#endif
|
|
|
|
// All code related to freeing memory and clearing up the device
|
|
cleanup:
|
|
if (device_matrixRowIndices != nullptr) {
|
|
cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices);
|
|
if (cudaFreeResult != cudaSuccess) {
|
|
std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl;
|
|
errorOccured = true;
|
|
}
|
|
device_matrixRowIndices = nullptr;
|
|
}
|
|
if (device_matrixColIndicesAndValues != nullptr) {
|
|
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
|
|
if (cudaFreeResult != cudaSuccess) {
|
|
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
|
|
errorOccured = true;
|
|
}
|
|
device_matrixColIndicesAndValues = nullptr;
|
|
}
|
|
if (device_x != nullptr) {
|
|
cudaError_t cudaFreeResult = cudaFree(device_x);
|
|
if (cudaFreeResult != cudaSuccess) {
|
|
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
|
|
errorOccured = true;
|
|
}
|
|
device_x = nullptr;
|
|
}
|
|
if (device_xSwap != nullptr) {
|
|
cudaError_t cudaFreeResult = cudaFree(device_xSwap);
|
|
if (cudaFreeResult != cudaSuccess) {
|
|
std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl;
|
|
errorOccured = true;
|
|
}
|
|
device_xSwap = nullptr;
|
|
}
|
|
if (device_b != nullptr) {
|
|
cudaError_t cudaFreeResult = cudaFree(device_b);
|
|
if (cudaFreeResult != cudaSuccess) {
|
|
std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl;
|
|
errorOccured = true;
|
|
}
|
|
device_b = nullptr;
|
|
}
|
|
if (device_multiplyResult != nullptr) {
|
|
cudaError_t cudaFreeResult = cudaFree(device_multiplyResult);
|
|
if (cudaFreeResult != cudaSuccess) {
|
|
std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl;
|
|
errorOccured = true;
|
|
}
|
|
device_multiplyResult = nullptr;
|
|
}
|
|
if (device_nondeterministicChoiceIndices != nullptr) {
|
|
cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices);
|
|
if (cudaFreeResult != cudaSuccess) {
|
|
std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." << std::endl;
|
|
errorOccured = true;
|
|
}
|
|
device_nondeterministicChoiceIndices = nullptr;
|
|
}
|
|
#ifdef DEBUG
|
|
std::cout << "(DLL) Finished cleanup." << std::endl;
|
|
#endif
|
|
|
|
return !errorOccured;
|
|
}
|
|
|
|
template <typename IndexType, typename ValueType>
|
|
void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) {
|
|
IndexType* device_matrixRowIndices = nullptr;
|
|
ValueType* device_matrixColIndicesAndValues = nullptr;
|
|
ValueType* device_x = nullptr;
|
|
ValueType* device_multiplyResult = nullptr;
|
|
|
|
#ifdef DEBUG
|
|
std::cout.sync_with_stdio(true);
|
|
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl;
|
|
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
|
|
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size();
|
|
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
|
|
#endif
|
|
|
|
const IndexType matrixRowCount = matrixRowIndices.size() - 1;
|
|
const IndexType matrixNnzCount = columnIndicesAndValues.size();
|
|
|
|
cudaError_t cudaMallocResult;
< |