From c0a7e424869c3ebddb65e3a3e3a7963f5763567e Mon Sep 17 00:00:00 2001 From: PBerger Date: Tue, 11 Mar 2014 01:41:57 +0100 Subject: [PATCH] Implemented a basic but complete kernel for value iteration in CUDA. It doesnt work :( Former-commit-id: 6a3a7aa505f91adb7034d35f2e0700b59b05718d --- .../srcCuda/basicValueIteration.cu | 168 +++++++++++- .../srcCuda/basicValueIteration.h | 8 +- resources/cudaForStorm/srcCuda/cudaForStorm.h | 3 + .../cudaForStorm/srcCuda/cuspExtension.h | 257 ++++++++++++++++++ resources/cudaForStorm/srcCuda/utility.cu | 18 +- resources/cudaForStorm/srcCuda/utility.h | 15 +- ...onNondeterministicLinearEquationSolver.cpp | 24 +- src/storage/SparseMatrix.h | 6 + 8 files changed, 477 insertions(+), 22 deletions(-) diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu index f23e7ed09..31fb8d4ba 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu @@ -1,4 +1,5 @@ #include "basicValueIteration.h" +#define CUSP_USE_TEXTURE_MEMORY #include #include @@ -6,54 +7,111 @@ #include #include "cusparse_v2.h" +#include "utility.h" + #include "cuspExtension.h" +#include +#include +#include + + +#define CUDA_CHECK_ALL_ERRORS() do { \ + cudaError_t errSync = cudaGetLastError(); \ + cudaError_t errAsync = cudaDeviceSynchronize(); \ + if (errSync != cudaSuccess) { \ + std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ")" << std::endl; \ + } \ + if (errAsync != cudaSuccess) { \ + std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \ + } } while(false) __global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) { *B = *A; } -template -void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { +template +struct equalModuloPrecision : public thrust::binary_function +{ +__host__ __device__ T operator()(const T &x, const T &y) const +{ + if (Relative) { + const T result = (x - y) / y; + return (result > 0) ? result : -result; + } else { + const T result = (x - y); + return (result > 0) ? result : -result; + } +} +}; + +template +void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { IndexType* device_matrixRowIndices = nullptr; IndexType* device_matrixColIndicesAndValues = nullptr; ValueType* device_x = nullptr; + ValueType* device_xSwap = nullptr; ValueType* device_b = nullptr; ValueType* device_multiplyResult = nullptr; IndexType* device_nondeterministicChoiceIndices = nullptr; + std::cout.sync_with_stdio(true); + std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory()))*100 << "%)." 
<< std::endl; + size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size(); + std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; + + const IndexType matrixRowCount = matrixRowIndices.size() - 1; + const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1; + const IndexType matrixNnzCount = columnIndicesAndValues.size(); + cudaError_t cudaMallocResult; - cudaMallocResult = cudaMalloc(&device_matrixRowIndices, matrixRowIndices.size()); + bool converged = false; + uint_fast64_t iterationCount = 0; + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1)); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2); + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_x, x.size()); + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_x), sizeof(ValueType) * matrixColCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_b, b.size()); + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_xSwap), sizeof(ValueType) * matrixColCount); + if (cudaMallocResult != cudaSuccess) { + std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_b), sizeof(ValueType) * matrixRowCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_multiplyResult, b.size()); + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; goto cleanup; } - cudaMallocResult = cudaMalloc(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size()); + CUDA_CHECK_ALL_ERRORS(); + cudaMallocResult = cudaMalloc(reinterpret_cast(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixRowCount + 1)); if (cudaMallocResult != cudaSuccess) { std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." 
<< std::endl; goto cleanup; @@ -62,38 +120,99 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve // Memory allocated, copy data to device cudaError_t cudaCopyResult; - cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * matrixRowIndices.size(), cudaMemcpyHostToDevice); + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; goto cleanup; } - cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * columnIndicesAndValues.size()) + (sizeof(ValueType) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice); + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; goto cleanup; } - cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * x.size(), cudaMemcpyHostToDevice); + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; goto cleanup; } - cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * b.size(), cudaMemcpyHostToDevice); + // Preset the xSwap to zeros... + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; goto cleanup; } - cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice); + // Preset the multiplyResult to zeros... 
+ CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not zero the multiplyResult vector, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } + + CUDA_CHECK_ALL_ERRORS(); + cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); if (cudaCopyResult != cudaSuccess) { std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; goto cleanup; } // Data is on device, start Kernel + while (!converged && iterationCount < maxIterationCount) + { // Inner scope, since a jump to the cleanup label must not cross variable initializations + cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); + CUDA_CHECK_ALL_ERRORS(); + + thrust::device_ptr devicePtrThrust_b(device_b); + thrust::device_ptr devicePtrThrust_multiplyResult(device_multiplyResult); + + // Transform: Add multiplyResult + b in place to multiplyResult + thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus()); + CUDA_CHECK_ALL_ERRORS(); + + // Reduce: Reduce multiplyResult to a new x vector + cusp::detail::device::storm_cuda_opt_vector_reduce(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult); + CUDA_CHECK_ALL_ERRORS(); + + // Check for convergence + // Transform: x = abs(x - xSwap) / xSwap + thrust::device_ptr devicePtrThrust_x(device_x); + thrust::device_ptr devicePtrThrust_x_end(device_x + matrixColCount); + thrust::device_ptr devicePtrThrust_xSwap(device_xSwap); + thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision()); + CUDA_CHECK_ALL_ERRORS(); + // Reduce: take the maximum over x and check whether it is < precision + ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, 0, thrust::maximum()); + CUDA_CHECK_ALL_ERRORS(); + converged = maxX < precision; + ++iterationCount; + + // Swap pointers so that device_x always contains the most current result + std::swap(device_x, device_xSwap); + } + std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl; + + // Get x back from the device + cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost); + if (cudaCopyResult != cudaSuccess) { + std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl; + goto cleanup; + } // All code related to freeing memory and clearing up the device cleanup: @@ -118,6 +237,13 @@ cleanup: } device_x = nullptr; } + if (device_xSwap != nullptr) { + cudaError_t cudaFreeResult = cudaFree(device_xSwap); + if (cudaFreeResult != cudaSuccess) { + std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." 
<< std::endl; + } + device_xSwap = nullptr; + } if (device_b != nullptr) { cudaError_t cudaFreeResult = cudaFree(device_b); if (cudaFreeResult != cudaSuccess) { @@ -150,6 +276,18 @@ void cudaForStormTestFunction(int a, int b) { std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl; } -void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { - basicValueIteration_mvReduce(maxIterationCount, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); +void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { + if (relativePrecisionCheck) { + basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); + } else { + basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); + } +} + +void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) { + if (relativePrecisionCheck) { + basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); + } else { + basicValueIteration_mvReduce(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); + } } \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.h b/resources/cudaForStorm/srcCuda/basicValueIteration.h index 1316e5014..61529d963 100644 --- a/resources/cudaForStorm/srcCuda/basicValueIteration.h +++ b/resources/cudaForStorm/srcCuda/basicValueIteration.h @@ -1,3 +1,6 @@ +#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ +#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ + #include #include #include @@ -6,4 +9,7 @@ #include "cudaForStorm_Export.h" cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b); -cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector const& matrixRowIndices, std::vector> columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); \ No newline at end of file +cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); +cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices); + 
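The two exported entry points above consume the matrix in the same packed CSR layout as the kernel: matrixRowIndices holds rowCount + 1 row offsets, columnIndicesAndValues holds one (column, value) pair per non-zero, and nondeterministicChoiceIndices holds one offset per row group plus a terminating entry. Judging from the call site in TopologicalValueIterationNondeterministicLinearEquationSolver.cpp and the memory-size computation in basicValueIteration.cu, the element types are uint_fast64_t indices and double values. A minimal host-side sketch under these assumptions follows; the two-state example data is made up for illustration and is not part of this patch.

// Hypothetical usage sketch for the entry points declared above.
// Only the container shapes follow the patch; the numbers are invented.
#include <cstdint>
#include <utility>
#include <vector>
#include "basicValueIteration.h"

void exampleCall() {
	// CSR row offsets: 3 matrix rows -> 4 entries.
	std::vector<uint_fast64_t> rowIndices = { 0, 2, 3, 4 };
	// One (column, value) pair per non-zero entry, interleaved as it is copied to the device.
	std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues = {
		{ 0, 0.5 }, { 1, 0.5 },	// row 0
		{ 1, 1.0 },	// row 1
		{ 0, 1.0 }	// row 2
	};
	// Two row groups (states): rows 0-1 belong to state 0, row 2 to state 1.
	std::vector<uint_fast64_t> nondeterministicChoiceIndices = { 0, 2, 3 };
	std::vector<double> x(2, 0.0);	// one entry per row group
	std::vector<double> b = { 0.0, 1.0, 0.0 };	// one entry per matrix row

	// At most 10000 iterations, precision 1e-6, absolute convergence check.
	basicValueIteration_mvReduce_uint64_double_minimize(10000, 1e-6, false, rowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
}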
+#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/cudaForStorm.h b/resources/cudaForStorm/srcCuda/cudaForStorm.h index 05e08b987..fdc484eaa 100644 --- a/resources/cudaForStorm/srcCuda/cudaForStorm.h +++ b/resources/cudaForStorm/srcCuda/cudaForStorm.h @@ -8,6 +8,9 @@ // TopologicalValueIteration #include "srcCuda/basicValueIteration.h" +// Utility Functions +#include "srcCuda/utility.h" + diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h index 238b3aa36..4b13005f3 100644 --- a/resources/cudaForStorm/srcCuda/cuspExtension.h +++ b/resources/cudaForStorm/srcCuda/cuspExtension.h @@ -25,6 +25,8 @@ #pragma once #include +#include +#include namespace cusp { @@ -33,6 +35,261 @@ namespace detail namespace device { +////////////////////////////////////////////////////////////////////////////// +// CSR SpMV kernels based on a vector model (one warp per row) +////////////////////////////////////////////////////////////////////////////// +// +// spmv_csr_vector_device +// Each row of the CSR matrix is assigned to a warp. The warp computes +// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with +// the x vector, in parallel. This division of work implies that +// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous +// manner (but generally not aligned). On GT200 these accesses are +// coalesced, unlike kernels based on the one-row-per-thread division of +// work. Since an entire 32-thread warp is assigned to each row, many +// threads will remain idle when their row contains a small number +// of elements. This code relies on implicit synchronization among +// threads in a warp. +// +// spmv_csr_vector_tex_device +// Same as spmv_csr_vector_device, except that the texture cache is +// used for accessing the x vector. 
+// +// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] + + +template +__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) +__global__ void +storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y) +{ + __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals + __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2]; + + const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; + + const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector + const IndexType vector_id = thread_id / THREADS_PER_VECTOR; // global vector index + const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block + const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(IndexType row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; + + const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local sum + ValueType sum = 0; + + if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) + sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + + // accumulate local sums + for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) + sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + } + else + { + // accumulate local sums + for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) + sum += matrixColumnIndicesAndValues[(2 * jj) + 1] * fetch_x(matrixColumnIndicesAndValues[2 * jj], x); + } + + // store local sum in shared memory + sdata[threadIdx.x] = sum; + + // reduce local sums to row sum + if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; + if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; + if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; + if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; + if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; + + // first thread writes the result + if (thread_lane == 0) + y[row] = sdata[threadIdx.x]; + } +} + +template +__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) +__global__ void +storm_cuda_opt_vector_reduce_kernel(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y, const ValueType minMaxInitializer) +{ + __shared__ volatile ValueType sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals + __shared__ volatile IndexType ptrs[ROWS_PER_BLOCK][2]; + + const IndexType 
THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; + + const IndexType thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index + const IndexType thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector + const IndexType vector_id = thread_id / THREADS_PER_ROW; // global vector index + const IndexType vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block + const IndexType num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors + + for(IndexType row = vector_id; row < num_rows; row += num_vectors) + { + // use two threads to fetch Ap[row] and Ap[row+1] + // this is considerably faster than the straightforward version + if(thread_lane < 2) + ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; + + const IndexType row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; + const IndexType row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; + + // initialize local Min/Max + ValueType localMinMaxElement = minMaxInitializer; + + if (THREADS_PER_ROW == 32 && row_end - row_start > 32) + { + // ensure aligned memory access to Aj and Ax + + IndexType jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; + + // accumulate local sums + if(jj >= row_start && jj < row_end) { + if(Minimize) { + localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // accumulate local sums + for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + else + { + // accumulate local sums + for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) + if(Minimize) { + localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; + } else { + localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; + } + } + + // store local sum in shared memory + sdata[threadIdx.x] = localMinMaxElement; + + // reduce local min/max to row min/max + if (Minimize) { + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement); + } else { + if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); + if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? 
sdata[threadIdx.x + 8] : localMinMaxElement); + if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); + if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); + if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement); + } + + // first thread writes the result + if (thread_lane == 0) + x[row] = sdata[threadIdx.x]; + } +} + +template +void __storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) +{ + ValueType __minMaxInitializer = 0; + if (Minimize) { + __minMaxInitializer = std::numeric_limits::max(); + } + const ValueType minMaxInitializer = __minMaxInitializer; + + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + storm_cuda_opt_vector_reduce_kernel <<>> + (num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); +} + +template +void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_entries, const IndexType * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) +{ + const IndexType rows_per_group = num_entries / num_rows; + + if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } + if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); return; } + + __storm_cuda_opt_vector_reduce(num_rows, nondeterministicChoiceIndices, x, y); +} + +template +void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +{ + const size_t THREADS_PER_BLOCK = 128; + const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; + + const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); + const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); + + if (UseCache) + bind_x(x); + + storm_cuda_opt_spmv_csr_vector_kernel <<>> + (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); + + if (UseCache) + unbind_x(x); +} + +template +void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +{ + const IndexType nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if 
(nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +template +void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) +{ + const IndexType nnz_per_row = num_entries / num_rows; + + if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } + + __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); +} + +// NON-OPT + template void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y) { diff --git a/resources/cudaForStorm/srcCuda/utility.cu b/resources/cudaForStorm/srcCuda/utility.cu index 9366453f9..99165ba07 100644 --- a/resources/cudaForStorm/srcCuda/utility.cu +++ b/resources/cudaForStorm/srcCuda/utility.cu @@ -1,3 +1,7 @@ +#include "utility.h" + +#include + size_t getFreeCudaMemory() { size_t freeMemory; size_t totalMemory; @@ -14,6 +18,16 @@ size_t getTotalCudaMemory() { return totalMemory; } -void resetCudaDevice() { - cudaDeviceReset(); +bool resetCudaDevice() { + cudaError_t result = cudaDeviceReset(); + return (result == cudaSuccess); +} + +int getRuntimeCudaVersion() { + int result = -1; + cudaError_t errorResult = cudaRuntimeGetVersion(&result); + if (errorResult != cudaSuccess) { + return -1; + } + return result; } \ No newline at end of file diff --git a/resources/cudaForStorm/srcCuda/utility.h b/resources/cudaForStorm/srcCuda/utility.h index ed25af9b6..f3110fbeb 100644 --- a/resources/cudaForStorm/srcCuda/utility.h +++ b/resources/cudaForStorm/srcCuda/utility.h @@ -1,3 +1,12 @@ -size_t getFreeCudaMemory(); -size_t getTotalCudaMemory(); -void resetCudaDevice(); \ No newline at end of file +#ifndef STORM_CUDAFORSTORM_UTILITY_H_ +#define STORM_CUDAFORSTORM_UTILITY_H_ + +// Library exports +#include "cudaForStorm_Export.h" + +cudaForStorm_EXPORT size_t getFreeCudaMemory(); +cudaForStorm_EXPORT size_t getTotalCudaMemory(); +cudaForStorm_EXPORT bool resetCudaDevice(); +cudaForStorm_EXPORT int getRuntimeCudaVersion(); + +#endif // STORM_CUDAFORSTORM_UTILITY_H_ \ No newline at end of file diff --git a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp index f6c4473fa..14f94f807 100644 --- a/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp +++ b/src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp @@ -13,6 +13,9 @@ #include "log4cplus/loggingmacros.h" extern log4cplus::Logger logger; +#include "storm-config.h" 
+#include "cudaForStorm.h" + namespace storm { namespace solver { @@ -80,6 +83,7 @@ namespace storm { for (auto sccIndexIt = topologicalSort.begin(); sccIndexIt != topologicalSort.end() && converged; ++sccIndexIt) { storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; + std::cout << "SCC Index: " << *sccIndexIt << std::endl; // Generate a submatrix storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend()); @@ -121,6 +125,7 @@ namespace storm { } // For the current SCC, we need to perform value iteration until convergence. +#ifndef STORM_HAVE_CUDAFORSTORM localIterations = 0; converged = false; while (!converged && localIterations < this->maximalNumberOfIterations) { @@ -157,6 +162,23 @@ namespace storm { ++localIterations; ++globalIterations; } + std::cout << "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations." << std::endl; +#else + if (!resetCudaDevice()) { + std::cout << "Could not reset CUDA Device!" << std::endl; + } + std::cout << "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast(getFreeCudaMemory()) / static_cast(getTotalCudaMemory())) * 100 << "%)." << std::endl; + size_t memSize = sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size(); + std::cout << "We will allocate " << memSize << " Bytes." << std::endl; + std::cout << "The CUDA Runtime Version is " << getRuntimeCudaVersion() << std::endl; + + if (minimize) { + basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices); + } + else { + basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices); + } +#endif // The Result of this SCC has to be taken back into the main result vector innerIndex = 0; @@ -165,7 +187,7 @@ namespace storm { ++innerIndex; } - // Since the pointers for swapping in the calculation point to temps they should not be valide anymore + // Since the pointers for swapping in the calculation point to temps they should not be valid anymore currentX = nullptr; swap = nullptr; diff --git a/src/storage/SparseMatrix.h b/src/storage/SparseMatrix.h index dbb80745b..cac8ae586 100644 --- a/src/storage/SparseMatrix.h +++ b/src/storage/SparseMatrix.h @@ -19,6 +19,11 @@ namespace storm { class EigenAdapter; class StormAdapter; } + + namespace solver { + template + class TopologicalValueIterationNondeterministicLinearEquationSolver; + } } namespace storm { @@ -142,6 +147,7 @@ namespace storm { friend class storm::adapters::GmmxxAdapter; friend class storm::adapters::EigenAdapter; friend class storm::adapters::StormAdapter; + friend class storm::solver::TopologicalValueIterationNondeterministicLinearEquationSolver; typedef typename std::vector>::iterator iterator; typedef typename std::vector>::const_iterator const_iterator;