
Implemented a basic but complete kernel for value iteration in CUDA.

It doesn't work :(


Former-commit-id: 6a3a7aa505
PBerger committed 11 years ago (branch: tempestpy_adaptions), commit c0a7e42486
  1. 168  resources/cudaForStorm/srcCuda/basicValueIteration.cu
  2. 8    resources/cudaForStorm/srcCuda/basicValueIteration.h
  3. 3    resources/cudaForStorm/srcCuda/cudaForStorm.h
  4. 257  resources/cudaForStorm/srcCuda/cuspExtension.h
  5. 18   resources/cudaForStorm/srcCuda/utility.cu
  6. 15   resources/cudaForStorm/srcCuda/utility.h
  7. 24   src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
  8. 6    src/storage/SparseMatrix.h
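For orientation, the commit's new exported entry points (declared in basicValueIteration.h below) take the CSR row offsets, the interleaved (column, value) pairs, the value vector x, the offset vector b, and the row-group boundaries of the nondeterministic choices. A minimal host-side sketch of a call, with toy data whose shapes follow my reading of the allocation code in basicValueIteration.cu (x has one entry per row group, b one entry per row), could look like this:

#include <cstdint>
#include <utility>
#include <vector>

#include "basicValueIteration.h"  // exported entry points added by this commit

int main() {
    // Toy shapes only, chosen to match the allocation sizes in basicValueIteration.cu:
    // rowCount = rowIndices.size() - 1, state count = choiceIndices.size() - 1.
    std::vector<uint_fast64_t> rowIndices = {0, 1, 2};                   // CSR row offsets (2 rows)
    std::vector<std::pair<uint_fast64_t, double>> columnsAndValues =     // (column, probability) per non-zero
        { {0, 1.0}, {0, 1.0} };
    std::vector<double> x = {0.0};                                       // one value per state (row group)
    std::vector<double> b = {0.0, 0.5};                                  // one offset per row
    std::vector<uint_fast64_t> choiceIndices = {0, 2};                   // row-group boundaries per state

    // Minimize over the choices of each state, absolute precision check, at most 10000 iterations.
    basicValueIteration_mvReduce_uint64_double_minimize(
        10000, 1e-6, false, rowIndices, columnsAndValues, x, b, choiceIndices);
    return 0;  // x now holds the approximated state values
}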

168  resources/cudaForStorm/srcCuda/basicValueIteration.cu

@@ -1,4 +1,5 @@
#include "basicValueIteration.h"
#define CUSP_USE_TEXTURE_MEMORY
#include <iostream>
#include <chrono>
@@ -6,54 +7,111 @@
#include <cuda_runtime.h>
#include "cusparse_v2.h"
#include "utility.h"
#include "cuspExtension.h"
#include <thrust/transform.h>
#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#define CUDA_CHECK_ALL_ERRORS() do { \
cudaError_t errSync = cudaGetLastError(); \
cudaError_t errAsync = cudaDeviceSynchronize(); \
if (errSync != cudaSuccess) { \
std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ")" << std::endl; \
} \
if (errAsync != cudaSuccess) { \
std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \
} } while(false)
__global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) {
*B = *A;
}
template <typename IndexType, typename ValueType>
void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) {
template<typename T, bool Relative>
struct equalModuloPrecision : public thrust::binary_function<T,T,T>
{
__host__ __device__ T operator()(const T &x, const T &y) const
{
if (Relative) {
const T result = (x - y) / y;
return (result > 0) ? result : -result;
} else {
const T result = (x - y);
return (result > 0) ? result : -result;
}
}
};
template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) {
IndexType* device_matrixRowIndices = nullptr;
IndexType* device_matrixColIndicesAndValues = nullptr;
ValueType* device_x = nullptr;
ValueType* device_xSwap = nullptr;
ValueType* device_b = nullptr;
ValueType* device_multiplyResult = nullptr;
IndexType* device_nondeterministicChoiceIndices = nullptr;
std::cout.sync_with_stdio(true);
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl;
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
const IndexType matrixRowCount = matrixRowIndices.size() - 1;
const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1;
const IndexType matrixNnzCount = columnIndicesAndValues.size();
cudaError_t cudaMallocResult;
cudaMallocResult = cudaMalloc<IndexType>(&device_matrixRowIndices, matrixRowIndices.size());
bool converged = false;
uint_fast64_t iterationCount = 0;
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<IndexType>(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2);
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<ValueType>(&device_x, x.size());
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<ValueType>(&device_b, b.size());
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_xSwap), sizeof(ValueType) * matrixColCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * matrixRowCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<ValueType>(&device_multiplyResult, b.size());
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<IndexType>(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size());
CUDA_CHECK_ALL_ERRORS();
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixRowCount + 1));
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
@@ -62,38 +120,99 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve
// Memory allocated, copy data to device
cudaError_t cudaCopyResult;
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * matrixRowIndices.size(), cudaMemcpyHostToDevice);
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * columnIndicesAndValues.size()) + (sizeof(ValueType) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice);
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * x.size(), cudaMemcpyHostToDevice);
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * b.size(), cudaMemcpyHostToDevice);
// Preset the xSwap to zeros...
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice);
// Preset the multiplyResult to zeros...
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
CUDA_CHECK_ALL_ERRORS();
cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
// Data is on device, start Kernel
while (!converged && iterationCount < maxIterationCount)
{ // In a sub-area since transfer of control via label evades initialization
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS();
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult);
// Transform: Add multiplyResult + b inplace to multiplyResult
thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>());
CUDA_CHECK_ALL_ERRORS();
// Reduce: Reduce multiplyResult to a new x vector
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, IndexType, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
CUDA_CHECK_ALL_ERRORS();
// Check for convergence
// Transform: x = abs(x - xSwap)/ xSwap
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount);
thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap);
thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
CUDA_CHECK_ALL_ERRORS();
// Reduce: get Max over x and check for res < Precision
ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, 0, thrust::maximum<ValueType>());
CUDA_CHECK_ALL_ERRORS();
converged = maxX < precision;
++iterationCount;
// Swap pointers, device_x always contains the most current result
std::swap(device_x, device_xSwap);
}
std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
// Get x back from the device
cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
// All code related to freeing memory and clearing up the device
cleanup:
@@ -118,6 +237,13 @@ cleanup:
}
device_x = nullptr;
}
if (device_xSwap != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_xSwap);
if (cudaFreeResult != cudaSuccess) {
std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl;
}
device_xSwap = nullptr;
}
if (device_b != nullptr) {
cudaError_t cudaFreeResult = cudaFree(device_b);
if (cudaFreeResult != cudaSuccess) {
@@ -150,6 +276,18 @@ void cudaForStormTestFunction(int a, int b) {
std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl;
}
void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
basicValueIteration_mvReduce<uint_fast64_t, double>(maxIterationCount, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
if (relativePrecisionCheck) {
basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
} else {
basicValueIteration_mvReduce<true, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
}
}
void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
if (relativePrecisionCheck) {
basicValueIteration_mvReduce<false, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
} else {
basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
}
}
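The convergence test inside the iteration loop above pairs the equalModuloPrecision functor with a max-reduce over the componentwise differences. A small host-only sketch of the same pattern (same functor shape, thrust::host_vector instead of raw device pointers, and an explicitly double-typed initial value for the reduce) is:

#include <iostream>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>

// Same shape as the functor in basicValueIteration.cu: componentwise |x - y| or |x - y| / y.
template<typename T, bool Relative>
struct equalModuloPrecision : public thrust::binary_function<T, T, T> {
    __host__ __device__ T operator()(const T& x, const T& y) const {
        const T diff = Relative ? (x - y) / y : (x - y);
        return (diff > 0) ? diff : -diff;
    }
};

int main() {
    thrust::host_vector<double> xOld(3), xNew(3), diff(3);
    xOld[0] = 1.00; xOld[1] = 2.00; xOld[2] = 3.00;
    xNew[0] = 1.01; xNew[1] = 2.00; xNew[2] = 2.98;

    // diff[i] = |xOld[i] - xNew[i]| / xNew[i]   (relative criterion)
    thrust::transform(xOld.begin(), xOld.end(), xNew.begin(), diff.begin(),
                      equalModuloPrecision<double, true>());
    // converged  <=>  max(diff) < precision
    double maxDiff = thrust::reduce(diff.begin(), diff.end(), 0.0, thrust::maximum<double>());
    std::cout << "max relative diff: " << maxDiff << std::endl;
    return 0;
}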

8  resources/cudaForStorm/srcCuda/basicValueIteration.h

@@ -1,3 +1,6 @@
#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_
#include <cstdint>
#include <vector>
#include <utility>
@@ -6,4 +9,7 @@
#include "cudaForStorm_Export.h"
cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_

3  resources/cudaForStorm/srcCuda/cudaForStorm.h

@@ -8,6 +8,9 @@
// TopologicalValueIteration
#include "srcCuda/basicValueIteration.h"
// Utility Functions
#include "srcCuda/utility.h"

257  resources/cudaForStorm/srcCuda/cuspExtension.h

@@ -25,6 +25,8 @@
#pragma once
#include <cusp/detail/device/spmv/csr_vector.h>
#include <limits>
#include <algorithm>
namespace cusp
{
@@ -33,6 +35,261 @@ namespace detail
namespace device
{
//////////////////////////////////////////////////////////////////////////////
// CSR SpMV kernels based on a vector model (one warp per row)
//////////////////////////////////////////////////////////////////////////////
//
// spmv_csr_vector_device
// Each row of the CSR matrix is assigned to a warp. The warp computes
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with
// the x vector, in parallel. This division of work implies that
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
// manner (but generally not aligned). On GT200 these accesses are
// coalesced, unlike kernels based on the one-row-per-thread division of
// work. Since an entire 32-thread warp is assigned to each row, many
// threads will remain idle when their row contains a small number
// of elements. This code relies on implicit synchronization among
// threads in a warp.
//
// spmv_csr_vector_tex_device
// Same as spmv_csr_vector_device, except that the texture cache is
// used for accessing the x vector.
//
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32]
template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y)
{
__shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals
__shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];
const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector
const IndexType vector_id = thread_id / THREADS_PER_VECTOR; // global vector index
const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block
const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors
for(IndexType row = vector_id; row < num_rows; row += num_vectors)
{
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the straightforward version
if(thread_lane < 2)
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane];
const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
// initialize local sum
ValueType sum = 0;
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
{
// ensure aligned memory access to Aj and Ax
IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
// accumulate local sums
if(jj >= row_start && jj < row_end)
sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2 * jj], x);
// accumulate local sums
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2 * jj], x);
}
else
{
// accumulate local sums
for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2 * jj], x);
}
// store local sum in shared memory
sdata[threadIdx.x] = sum;
// reduce local sums to row sum
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8];
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4];
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2];
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1];
// first thread writes the result
if (thread_lane == 0)
y[row] = sdata[threadIdx.x];
}
}
template <typename IndexType, typename ValueType, unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1)
__global__ void
storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y, const ValueType minMaxInitializer)
{
__shared__ volatile ValueType sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals
__shared__ volatile IndexType ptrs[ROWS_PER_BLOCK][2];
const IndexType THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW;
const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
const IndexType thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector
const IndexType vector_id = thread_id / THREADS_PER_ROW; // global vector index
const IndexType vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block
const IndexType num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors
for(IndexType row = vector_id; row < num_rows; row += num_vectors)
{
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the straightforward version
if(thread_lane < 2)
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane];
const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row];
const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1];
// initialize local Min/Max
ValueType localMinMaxElement = minMaxInitializer;
if (THREADS_PER_ROW == 32 && row_end - row_start > 32)
{
// ensure aligned memory access to Aj and Ax
IndexType jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane;
// accumulate local sums
if(jj >= row_start && jj < row_end) {
if(Minimize) {
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
// accumulate local sums
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW)
if(Minimize) {
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
else
{
// accumulate local sums
for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW)
if(Minimize) {
localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement;
} else {
localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement;
}
}
// store local sum in shared memory
sdata[threadIdx.x] = localMinMaxElement;
// reduce local min/max to row min/max
if (Minimize) {
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);
} else {
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement);
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement);
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement);
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement);
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);
}
// first thread writes the result
if (thread_lane == 0)
x[row] = sdata[threadIdx.x];
}
}
template <bool Minimize, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y)
{
ValueType __minMaxInitializer = 0;
if (Minimize) {
__minMaxInitializer = std::numeric_limits<ValueType>::max();
}
const ValueType minMaxInitializer = __minMaxInitializer;
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
storm_cuda_opt_vector_reduce_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer);
}
template <bool Minimize, typename IndexType, typename ValueType>
void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_entries, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y)
{
const IndexType rows_per_group = num_entries / num_rows;
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; }
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; }
__storm_cuda_opt_vector_reduce<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y);
}
template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
{
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
if (UseCache)
bind_x(x);
storm_cuda_opt_spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
if (UseCache)
unbind_x(x);
}
template <typename IndexType, typename ValueType>
void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
template <typename IndexType, typename ValueType>
void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
__storm_cuda_opt_spmv_csr_vector<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
// NON-OPT
template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
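The warp-per-row kernel above reads the column index at position 2*jj and the value at 2*jj+1 of the interleaved array; on the host side the same data lives in a std::vector of (column, value) pairs. As a hedged baseline for checking the CUDA result, a plain CPU reference of the per-row dot products (referenceSpmv is a hypothetical helper, not part of this commit) might look like this:

#include <cstdint>
#include <utility>
#include <vector>

// Host-side reference SpMV over the CSR layout the kernel consumes:
// rowIndices[r] .. rowIndices[r+1] delimit the (column, value) pairs of row r.
std::vector<double> referenceSpmv(std::vector<uint_fast64_t> const& rowIndices,
                                  std::vector<std::pair<uint_fast64_t, double>> const& columnsAndValues,
                                  std::vector<double> const& x) {
    std::vector<double> y(rowIndices.size() - 1, 0.0);
    for (uint_fast64_t row = 0; row + 1 < rowIndices.size(); ++row) {
        for (uint_fast64_t jj = rowIndices[row]; jj < rowIndices[row + 1]; ++jj) {
            y[row] += columnsAndValues[jj].second * x[columnsAndValues[jj].first];
        }
    }
    return y;
}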

18  resources/cudaForStorm/srcCuda/utility.cu

@@ -1,3 +1,7 @@
#include "utility.h"
#include <cuda_runtime.h>
size_t getFreeCudaMemory() {
size_t freeMemory;
size_t totalMemory;
@@ -14,6 +18,16 @@ size_t getTotalCudaMemory() {
return totalMemory;
}
void resetCudaDevice() {
cudaDeviceReset();
bool resetCudaDevice() {
cudaError_t result = cudaDeviceReset();
return (result == cudaSuccess);
}
int getRuntimeCudaVersion() {
int result = -1;
cudaError_t errorResult = cudaRuntimeGetVersion(&result);
if (errorResult != cudaSuccess) {
return -1;
}
return result;
}

15  resources/cudaForStorm/srcCuda/utility.h

@@ -1,3 +1,12 @@
size_t getFreeCudaMemory();
size_t getTotalCudaMemory();
void resetCudaDevice();
#ifndef STORM_CUDAFORSTORM_UTILITY_H_
#define STORM_CUDAFORSTORM_UTILITY_H_
// Library exports
#include "cudaForStorm_Export.h"
cudaForStorm_EXPORT size_t getFreeCudaMemory();
cudaForStorm_EXPORT size_t getTotalCudaMemory();
cudaForStorm_EXPORT bool resetCudaDevice();
cudaForStorm_EXPORT int getRuntimeCudaVersion();
#endif // STORM_CUDAFORSTORM_UTILITY_H_

24  src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp

@@ -13,6 +13,9 @@
#include "log4cplus/loggingmacros.h"
extern log4cplus::Logger logger;
#include "storm-config.h"
#include "cudaForStorm.h"
namespace storm {
namespace solver {
@@ -80,6 +83,7 @@ namespace storm {
for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) {
storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
std::cout << "SCC Index: " << *sccIndexIt << std::endl;
// Generate a submatrix
storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
@@ -121,6 +125,7 @@ namespace storm {
}
// For the current SCC, we need to perform value iteration until convergence.
#ifndef STORM_HAVE_CUDAFORSTORM
localIterations = 0;
converged = false;
while (!converged && localIterations < this->maximalNumberOfIterations) {
@@ -157,6 +162,23 @@ namespace storm {
++localIterations;
++globalIterations;
}
std::cout << "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations." << std::endl;
#else
if (!resetCudaDevice()) {
std::cout << "Could not reset CUDA Device!" << std::endl;
}
std::cout << "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
size_t memSize = sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size();
std::cout << "We will allocate " << memSize << " Bytes." << std::endl;
std::cout << "The CUDA Runtime Version is " << getRuntimeCudaVersion() << std::endl;
if (minimize) {
basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices);
}
else {
basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices);
}
#endif
// The Result of this SCC has to be taken back into the main result vector
innerIndex = 0;
@@ -165,7 +187,7 @@ namespace storm {
++innerIndex;
}
// Since the pointers for swapping in the calculation point to temps they should not be valide anymore
// Since the pointers for swapping in the calculation point to temps they should not be valid anymore
currentX = nullptr;
swap = nullptr;

6  src/storage/SparseMatrix.h

@@ -19,6 +19,11 @@ namespace storm {
class EigenAdapter;
class StormAdapter;
}
namespace solver {
template<typename T>
class TopologicalValueIterationNondeterministicLinearEquationSolver;
}
}
namespace storm {
@@ -142,6 +147,7 @@ namespace storm {
friend class storm::adapters::GmmxxAdapter;
friend class storm::adapters::EigenAdapter;
friend class storm::adapters::StormAdapter;
friend class storm::solver::TopologicalValueIterationNondeterministicLinearEquationSolver<T>;
typedef typename std::vector<std::pair<uint_fast64_t, T>>::iterator iterator;
typedef typename std::vector<std::pair<uint_fast64_t, T>>::const_iterator const_iterator;
