From 26500ff4a8f0f4a841803d62aa71e7cfbde93ee6 Mon Sep 17 00:00:00 2001
From: PBerger
Date: Wed, 19 Mar 2014 03:54:31 +0100
Subject: [PATCH] Refactored the CUDA Kernel to once again use the "hacked"
 combination of column indices and values with a bit of reinterpret_cast
 magic. Refactored the CUDA-SCC grouping algorithm, as it took 80x longer to
 calculate the groups than it took to calculate the entire solution.

Former-commit-id: 5a5ffabe388f9220bb900420e771a43f256d93b4
---
 .../srcCuda/basicValueIteration.cu             | 94 +++++--------------
 .../cudaForStorm/srcCuda/cuspExtension.h       | 36 +++----
 ...onNondeterministicLinearEquationSolver.cpp  | 91 ++++++++++++------
 3 files changed, 104 insertions(+), 117 deletions(-)
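Note on the "hacked" layout (editorial illustration, not part of the patch): instead of splitting columnIndicesAndValues into separate index and value vectors on the host (the now commented-out exploadVector path below), the raw bytes of the pair vector are uploaded in a single allocation and a single copy, and the kernel later reads the value slots through a reinterpret_cast. The sketch assumes sizeof(uint_fast64_t) == sizeof(double) == 8 and that std::pair<uint_fast64_t, double> is packed as {index, value} with no padding, which holds on the usual 64-bit ABIs but is not guaranteed by the C++ standard; the function name uploadInterleaved is invented for the example.

    // Illustrative sketch of the interleaved (index, value) upload.
    #include <cstdint>
    #include <utility>
    #include <vector>
    #include <cuda_runtime.h>

    void uploadInterleaved(std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues) {
        size_t const nnz = columnIndicesAndValues.size();
        uint_fast64_t* device_colIndicesAndValues = nullptr;

        // One allocation and one copy instead of two separate index/value buffers.
        size_t const bytes = (sizeof(uint_fast64_t) + sizeof(double)) * nnz;
        if (cudaMalloc(reinterpret_cast<void**>(&device_colIndicesAndValues), bytes) != cudaSuccess) {
            return;
        }
        cudaMemcpy(device_colIndicesAndValues, columnIndicesAndValues.data(), bytes, cudaMemcpyHostToDevice);

        // A kernel then reads entry jj as:
        //   column index: device_colIndicesAndValues[2 * jj]
        //   value:        reinterpret_cast<double const*>(device_colIndicesAndValues)[2 * jj + 1]

        cudaFree(device_colIndicesAndValues);
    }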
diff --git a/resources/cudaForStorm/srcCuda/basicValueIteration.cu b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
index 231c6c4be..b7a879e8b 100644
--- a/resources/cudaForStorm/srcCuda/basicValueIteration.cu
+++ b/resources/cudaForStorm/srcCuda/basicValueIteration.cu
@@ -58,13 +58,12 @@ void exploadVector(std::vector> const& inputVect
 
 template
 void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector& x, std::vector const& b, std::vector const& nondeterministicChoiceIndices) {
-    std::vector matrixColumnIndices;
-    std::vector matrixValues;
-    exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues);
+    //std::vector matrixColumnIndices;
+    //std::vector matrixValues;
+    //exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues);
 
     IndexType* device_matrixRowIndices = nullptr;
-    IndexType* device_matrixColIndices = nullptr;
-    ValueType* device_matrixValues = nullptr;
+    IndexType* device_matrixColIndicesAndValues = nullptr;
     ValueType* device_x = nullptr;
     ValueType* device_xSwap = nullptr;
     ValueType* device_b = nullptr;
@@ -96,16 +95,9 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
     }
 
     CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixValues), sizeof(ValueType) * matrixNnzCount);
+    cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
     if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl;
+        std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
         goto cleanup;
     }
@@ -159,16 +151,9 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
     }
 
     CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
-    if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
+    cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
     if (cudaCopyResult != cudaSuccess) {
-        std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl;
+        std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
         goto cleanup;
     }
@@ -216,7 +201,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
     // Data is on device, start Kernel
     while (!converged && iterationCount < maxIterationCount) {
         // In a sub-area since transfer of control via label evades initialization
-        cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
+        cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
         CUDA_CHECK_ALL_ERRORS();
 
         thrust::device_ptr devicePtrThrust_b(device_b);
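A note on the "In a sub-area since transfer of control via label evades initialization" comment kept above: in C++, a goto may not jump forward across the initialization of a variable, so the loop body keeps its class-type locals in a nested scope that the error-handling jumps to "cleanup:" never enter. A minimal illustration (not from the patch; names invented):

    // Why the nested scope matters for the goto-based cleanup pattern.
    #include <cuda_runtime.h>
    #include <thrust/device_ptr.h>
    #include <thrust/fill.h>

    bool runOnce(size_t n) {
        bool success = false;
        double* device_tmp = nullptr;
        if (cudaMalloc(reinterpret_cast<void**>(&device_tmp), sizeof(double) * n) != cudaSuccess) {
            goto cleanup;   // legal: no initialization is jumped over
        }
        {
            // Class-type locals are confined to this block, so the goto above does
            // not cross their initialization (the compiler would reject that).
            thrust::device_ptr<double> wrapped(device_tmp);
            thrust::fill(wrapped, wrapped + n, 0.0);
        }
        success = true;
    cleanup:
        if (device_tmp != nullptr) {
            cudaFree(device_tmp);
            device_tmp = nullptr;
        }
        return success;
    }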
@@ -272,19 +257,12 @@ cleanup:
         }
         device_matrixRowIndices = nullptr;
     }
-    if (device_matrixColIndices != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices);
+    if (device_matrixColIndicesAndValues != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
         if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl;
+            std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
         }
-        device_matrixColIndices = nullptr;
-    }
-    if (device_matrixValues != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixValues);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl;
-        }
-        device_matrixValues = nullptr;
+        device_matrixColIndicesAndValues = nullptr;
     }
     if (device_x != nullptr) {
         cudaError_t cudaFreeResult = cudaFree(device_x);
@@ -328,13 +306,8 @@ cleanup:
 
 template
 void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector const& matrixRowIndices, std::vector> const& columnIndicesAndValues, std::vector const& x, std::vector& b) {
-    std::vector matrixColumnIndices;
-    std::vector matrixValues;
-    exploadVector(columnIndicesAndValues, matrixColumnIndices, matrixValues);
-
     IndexType* device_matrixRowIndices = nullptr;
-    IndexType* device_matrixColIndices = nullptr;
-    ValueType* device_matrixValues = nullptr;
+    IndexType* device_matrixColIndicesAndValues = nullptr;
     ValueType* device_x = nullptr;
     ValueType* device_multiplyResult = nullptr;
@@ -359,16 +332,9 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector
     }
 
     CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount);
-    if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl;
-        goto cleanup;
-    }
-
-    CUDA_CHECK_ALL_ERRORS();
-    cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixValues), sizeof(ValueType) * matrixNnzCount);
+    cudaMallocResult = cudaMalloc(reinterpret_cast(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
     if (cudaMallocResult != cudaSuccess) {
-        std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl;
+        std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl;
         goto cleanup;
     }
@@ -401,16 +367,9 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector
-    cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult);
+    cusp::detail::device::storm_cuda_opt_spmv_csr_vector(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
     CUDA_CHECK_ALL_ERRORS();
 
 #ifdef DEBUG
@@ -460,19 +419,12 @@ cleanup:
         }
         device_matrixRowIndices = nullptr;
     }
-    if (device_matrixColIndices != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices);
-        if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl;
-        }
-        device_matrixColIndices = nullptr;
-    }
-    if (device_matrixValues != nullptr) {
-        cudaError_t cudaFreeResult = cudaFree(device_matrixValues);
+    if (device_matrixColIndicesAndValues != nullptr) {
+        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
         if (cudaFreeResult != cudaSuccess) {
-            std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl;
+            std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
         }
-        device_matrixValues = nullptr;
+        device_matrixColIndicesAndValues = nullptr;
     }
     if (device_x != nullptr) {
         cudaError_t cudaFreeResult = cudaFree(device_x);
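For orientation before the kernel changes in cuspExtension.h: the device-side counterpart of the interleaved buffer is a plain pointer to 8-byte slots, with column indices in the even slots and values in the odd slots. The following is a deliberately simplified one-thread-per-row CSR SpMV kernel (not the warp-vector kernel modified below; all names are invented) that shows only the access pattern:

    // Simplified illustration of reading the interleaved buffer on the device.
    #include <cstdint>

    __global__ void spmvRowPerThread(uint64_t const numRows,
                                     uint64_t const* rowOffsets,            // CSR row offsets, numRows + 1 entries
                                     uint64_t const* colIndicesAndValues,   // interleaved {index, value} slots
                                     double const* x, double* y) {
        uint64_t const row = blockIdx.x * blockDim.x + threadIdx.x;
        if (row >= numRows) return;

        double sum = 0.0;
        for (uint64_t jj = rowOffsets[row]; jj < rowOffsets[row + 1]; ++jj) {
            uint64_t const col = colIndicesAndValues[2 * jj];                                       // even slot
            double const val = reinterpret_cast<double const*>(colIndicesAndValues)[2 * jj + 1];    // odd slot
            sum += val * x[col];
        }
        y[row] = sum;
    }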
diff --git a/resources/cudaForStorm/srcCuda/cuspExtension.h b/resources/cudaForStorm/srcCuda/cuspExtension.h
index f07849816..7c9a9fb28 100644
--- a/resources/cudaForStorm/srcCuda/cuspExtension.h
+++ b/resources/cudaForStorm/srcCuda/cuspExtension.h
@@ -61,7 +61,7 @@ namespace device
 template
 __launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
 __global__ void
-storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType * x, ValueType * y)
+storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType * x, ValueType * y)
 {
     __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
     __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];
@@ -95,17 +95,17 @@ storm_cuda_opt_spmv_csr_vector_kernel(const IndexType num_rows, const IndexType
 
             // accumulate local sums
             if(jj >= row_start && jj < row_end)
-                sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x);
+                sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x);
 
             // accumulate local sums
             for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
-                sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x);
+                sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x);
         }
         else
         {
             // accumulate local sums
             for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
-                sum += matrixValues[jj] * fetch_x(matrixColumnIndices[jj], x);
+                sum += reinterpret_cast(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x(matrixColumnIndicesAndValues[2*jj], x);
         }
 
         // store local sum in shared memory
@@ -244,7 +244,7 @@ void storm_cuda_opt_vector_reduce(const IndexType num_rows, const IndexType num_
 }
 
 template
-void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
 {
     const size_t THREADS_PER_BLOCK = 128;
     const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
@@ -256,36 +256,36 @@ void __storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType
         bind_x(x);
 
     storm_cuda_opt_spmv_csr_vector_kernel <<>>
-        (num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+        (num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
 
     if (UseCache)
         unbind_x(x);
 }
 
 template
-void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+void storm_cuda_opt_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
 {
     const IndexType nnz_per_row = num_entries / num_rows;
 
-    if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
 
-    __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+    __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
 }
 
 template
-void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
+void storm_cuda_opt_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y)
 {
     const IndexType nnz_per_row = num_entries / num_rows;
 
-    if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
-    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
+    if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
+    if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; }
 
-    __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
+    __storm_cuda_opt_spmv_csr_vector(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
 }
 
 // NON-OPT
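The template arguments of the dispatch calls above were lost in this copy of the patch. For reference, the pattern follows the usual cusp csr_vector heuristic: the average number of non-zeros per row selects a compile-time THREADS_PER_VECTOR value, instantiating a separate kernel per bucket. The sketch below only illustrates that heuristic; all names are invented, and the full-warp (32) fall-through is the standard cusp choice, assumed rather than taken from the file.

    // Self-contained illustration of the nnz-per-row dispatch heuristic.
    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    template <std::size_t THREADS_PER_VECTOR>
    void launchSpmv(std::uint64_t num_rows) {
        // Stand-in for the real kernel launch.
        std::cout << num_rows << " rows -> " << THREADS_PER_VECTOR << " threads per row\n";
    }

    void dispatchByRowDensity(std::uint64_t num_rows, std::uint64_t num_entries) {
        std::uint64_t const nnz_per_row = num_entries / num_rows;
        if (nnz_per_row <= 2)  { launchSpmv<2>(num_rows);  return; }
        if (nnz_per_row <= 4)  { launchSpmv<4>(num_rows);  return; }
        if (nnz_per_row <= 8)  { launchSpmv<8>(num_rows);  return; }
        if (nnz_per_row <= 16) { launchSpmv<16>(num_rows); return; }
        launchSpmv<32>(num_rows);  // wide rows: one full warp per row
    }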
"src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h" #include -#include #include "src/settings/Settings.h" #include "src/utility/vector.h" @@ -46,7 +45,6 @@ namespace storm { template void TopologicalValueIterationNondeterministicLinearEquationSolver::solveEquationSystem(bool minimize, storm::storage::SparseMatrix const& A, std::vector& x, std::vector const& b, std::vector* multiplyResult, std::vector* newX) const { - // Now, we need to determine the SCCs of the MDP and perform a topological sort. std::vector const& nondeterministicChoiceIndices = A.getRowGroupIndices(); storm::models::NonDeterministicMatrixBasedPseudoModel pseudoModel(A, nondeterministicChoiceIndices); @@ -219,57 +217,94 @@ namespace storm { size_t lastResultIndex = 0; std::vector const& rowGroupIndices = matrix.getRowGroupIndices(); + + size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(matrix.getRowCount()), rowGroupIndices.size(), static_cast(matrix.getEntryCount())); + size_t const gpuSizePerRowGroup = std::max(static_cast(gpuSizeOfCompleteSystem / rowGroupIndices.size()), static_cast(1)); + size_t const maxRowGroupsPerMemory = cudaFreeMemory / gpuSizePerRowGroup; + size_t currentSize = 0; - for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) { - storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt]; + size_t neededReserveSize = 0; + size_t startIndex = 0; + for (size_t i = 0; i < topologicalSort.size(); ++i) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[i]]; + size_t const currentSccSize = scc.size(); uint_fast64_t rowCount = 0; uint_fast64_t entryCount = 0; - storm::storage::StateBlock rowGroups; - rowGroups.reserve(scc.size()); for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) { rowCount += matrix.getRowGroupSize(*sccIt); entryCount += matrix.getRowGroupEntryCount(*sccIt); - rowGroups.insert(*sccIt); } size_t sccSize = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast(rowCount), scc.size(), static_cast(entryCount)); if ((currentSize + sccSize) <= cudaFreeMemory) { // There is enough space left in the current group - - if (currentSize == 0) { - result.push_back(std::make_pair(true, rowGroups)); - } - else { - result[lastResultIndex].second.insert(rowGroups.begin(), rowGroups.end()); - } + neededReserveSize += currentSccSize; currentSize += sccSize; - } - else { - if (sccSize <= cudaFreeMemory) { + } else { + // This would make the last open group to big for the GPU + + if (startIndex < i) { + if ((startIndex + 1) < i) { + // More than one component + std::vector tempGroups; + tempGroups.reserve(neededReserveSize); + for (size_t j = startIndex; j < i; ++j) { + storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]]; + tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend()); + } + std::sort(tempGroups.begin(), tempGroups.end()); + + result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend()))); + } else { + // Only one group, copy construct. 
+                            result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
+                        }
                         ++lastResultIndex;
-                        result.push_back(std::make_pair(true, rowGroups));
-                        currentSize = sccSize;
                     }
-                    else {
+
+                    if (sccSize <= cudaFreeMemory) {
+                        currentSize = sccSize;
+                        neededReserveSize = currentSccSize;
+                        startIndex = i;
+                    } else {
                         // This group is too big to fit into the CUDA Memory by itself
-                        lastResultIndex += 2;
-                        result.push_back(std::make_pair(false, rowGroups));
+                        result.push_back(std::make_pair(false, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[i]]))));
+                        ++lastResultIndex;
+                        currentSize = 0;
+                        neededReserveSize = 0;
+                        startIndex = i + 1;
+                    }
+                }
+            }
+
+            size_t const topologicalSortSize = topologicalSort.size();
+            if (startIndex < topologicalSortSize) {
+                if ((startIndex + 1) < topologicalSortSize) {
+                    // More than one component
+                    std::vector tempGroups;
+                    tempGroups.reserve(neededReserveSize);
+                    for (size_t j = startIndex; j < topologicalSortSize; ++j) {
+                        storm::storage::StateBlock const& scc = sccDecomposition[topologicalSort[j]];
+                        tempGroups.insert(tempGroups.cend(), scc.cbegin(), scc.cend());
                     }
+                    std::sort(tempGroups.begin(), tempGroups.end());
+
+                    result.push_back(std::make_pair(true, storm::storage::StateBlock(boost::container::ordered_unique_range, tempGroups.cbegin(), tempGroups.cend())));
                 }
+                else {
+                    // Only one group, copy construct.
+                    result.push_back(std::make_pair(true, storm::storage::StateBlock(std::move(sccDecomposition[topologicalSort[startIndex]]))));
+                }
+                ++lastResultIndex;
             }
 #else
             for (auto sccIndexIt = topologicalSort.cbegin(); sccIndexIt != topologicalSort.cend(); ++sccIndexIt) {
                 storm::storage::StateBlock const& scc = sccDecomposition[*sccIndexIt];
-                storm::storage::StateBlock rowGroups;
-                rowGroups.reserve(scc.size());
-                for (auto sccIt = scc.cbegin(); sccIt != scc.cend(); ++sccIt) {
-                    rowGroups.insert(*sccIt);
-                }
-                result.push_back(std::make_pair(false, rowGroups));
+                result.push_back(std::make_pair(false, scc));
             }
 #endif
             return result;
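On the grouping refactor itself: the old code inserted every state of every SCC one by one into the last group's StateBlock, which, for an ordered flat-set-like container, repeatedly shifts the underlying sorted storage and is the likely source of the 80x slowdown mentioned in the commit message. The new code only batches SCC indices while scanning the topological order, and builds each merged group once: concatenate the member states, sort them, and construct the StateBlock from the already ordered, duplicate-free range. The sketch below shows that strategy in isolation, assuming StateBlock behaves like a boost::container::flat_set (which the ordered_unique_range constructor in the patch suggests); the per-SCC GPU size estimates are taken as given, and the "SCC too large for the GPU" case handled by the patch is omitted for brevity.

    // Illustrative sketch of GPU-memory-bounded SCC grouping (not the patch code).
    #include <algorithm>
    #include <cstdint>
    #include <vector>
    #include <boost/container/flat_set.hpp>

    using StateBlock = boost::container::flat_set<uint64_t>;

    std::vector<StateBlock> groupSccsForGpu(std::vector<StateBlock> const& sccsInTopologicalOrder,
                                            std::vector<size_t> const& sccGpuSizes,
                                            size_t gpuFreeMemory) {
        std::vector<StateBlock> groups;
        std::vector<uint64_t> pending;   // states of all SCCs in the currently open group
        size_t currentSize = 0;

        auto flush = [&]() {
            if (pending.empty()) return;
            std::sort(pending.begin(), pending.end());
            // SCCs are disjoint, so the sorted range is already unique; build the set in one shot.
            groups.emplace_back(boost::container::ordered_unique_range, pending.cbegin(), pending.cend());
            pending.clear();
            currentSize = 0;
        };

        for (size_t i = 0; i < sccsInTopologicalOrder.size(); ++i) {
            if (currentSize + sccGpuSizes[i] > gpuFreeMemory) {
                flush();   // close the open group before starting a new one
            }
            StateBlock const& scc = sccsInTopologicalOrder[i];
            pending.insert(pending.end(), scc.cbegin(), scc.cend());
            currentSize += sccGpuSizes[i];
        }
        flush();
        return groups;
    }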