tempest

Added restrict keyword to CUDA kernel. This should enhance compiler optimization.

Refactored TopologicalValueIterationNondeterministicLinearEquationSolver to support "down-casting" to float.
Added better timing output.


Former-commit-id: 688c40decb

tempestpy_adaptions

PBerger 10 years ago

parent

71320239f1

commit

493f93a94b

3 changed files with 283 additions and 336 deletions

Split View

								
									4

resources/cudaForStorm/srcCuda/cuspExtensionDouble.h

									View File
								
					@ -65,7 +65,7 @@ namespace device
				
					template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache>
				
					__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
				
					__global__ void
				
					storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double * x, double * y)
				
					storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y)
				
					{
				
					    __shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
				
					    __shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2];
				
					@ -135,7 +135,7 @@ storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const
				
					template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize>
				
					__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1)
				
					__global__ void
				
					storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y, const double minMaxInitializer)
				
					storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer)
				
					{
				
					    __shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2];  // padded to avoid reduction conditionals
				
					    __shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2];

								
									566

src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp

									View File
								
					@ -1,6 +1,7 @@
				
					#include "src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h"

					#include <utility>

					#include <chrono>

					#include "src/settings/Settings.h"

					#include "src/utility/vector.h"

					@ -42,386 +43,283 @@ namespace storm {
				
							NondeterministicLinearEquationSolver<ValueType>* TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::clone() const {
				
								return new TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>(*this);
				
					        }
				
					        template<typename ValueType>
				
							void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
				
					#ifdef GPU_USE_FLOAT

					#define __FORCE_FLOAT_CALCULATION true

					#else

					#define __FORCE_FLOAT_CALCULATION false

					#endif

								if (__FORCE_FLOAT_CALCULATION && (sizeof(ValueType) == sizeof(double))) {
				
									TopologicalValueIterationNondeterministicLinearEquationSolver<float> tvindles(precision, maximalNumberOfIterations, relative);
				
							template<>
				
							void TopologicalValueIterationNondeterministicLinearEquationSolver<float>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<float> const& A, std::vector<float>& x, std::vector<float> const& b, std::vector<float>* multiplyResult, std::vector<float>* newX) const {
				
								// For testing only

								std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl;
				
								LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>");
				
									storm::storage::SparseMatrix<float> new_A = A.toValueType<float>();
				
									std::vector<float> new_x = storm::utility::vector::toValueType<float>(x);
				
									std::vector<float> const new_b = storm::utility::vector::toValueType<float>(b);
				
								// Now, we need to determine the SCCs of the MDP and perform a topological sort.

								std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
				
								storm::models::NonDeterministicMatrixBasedPseudoModel<float> pseudoModel(A, nondeterministicChoiceIndices);
				
								storm::storage::StronglyConnectedComponentDecomposition<float> sccDecomposition(pseudoModel, false, false);
				
								if (sccDecomposition.size() == 0) {
				
									LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs.");
				
									throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs.";
				
								}
				
									tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr);
				
								storm::storage::SparseMatrix<float> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
				
								std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
				
								// Calculate the optimal distribution of sccs

								std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A);
				
								LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs.");
				
								std::vector<float>* currentX = nullptr;
				
								std::vector<float>* swap = nullptr;
				
								size_t currentMaxLocalIterations = 0;
				
								size_t localIterations = 0;
				
								size_t globalIterations = 0;
				
								bool converged = true;
				
								// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only

								// solved after all SCCs it depends on have been solved.

								int counter = 0;
				
								for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
				
									bool const useGpu = sccIndexIt->first;
				
									storm::storage::StateBlock const& scc = sccIndexIt->second;
				
									// Generate a sub matrix

									storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
				
									storm::storage::SparseMatrix<float> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
				
									std::vector<float> sccSubB(sccSubmatrix.getRowCount());
				
									storm::utility::vector::selectVectorValues<float>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
				
									std::vector<float> sccSubX(sccSubmatrix.getColumnCount());
				
									std::vector<float> sccSubXSwap(sccSubmatrix.getColumnCount());
				
									std::vector<float> sccMultiplyResult(sccSubmatrix.getRowCount());
				
									// Prepare the pointers for swapping in the calculation

									currentX = &sccSubX;
				
									swap = &sccSubXSwap;
				
									storm::utility::vector::selectVectorValues<float>(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states)

									std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1);
				
									sccSubNondeterministicChoiceIndices.at(0) = 0;
				
									// Pre-process all dependent states

									// Remove outgoing transitions and create the ChoiceIndices

									uint_fast64_t innerIndex = 0;
				
									uint_fast64_t outerIndex = 0;
				
									for (uint_fast64_t state : scc) {
				
										// Choice Indices

										sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]);
				
										for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) {
				
											storm::storage::SparseMatrix<float>::const_rows row = A.getRow(rowGroupIt);
				
											for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) {
				
												if (!subMatrixIndices.get(rowIt->getColumn())) {
				
													// This is an outgoing transition of a state in the SCC to a state not included in the SCC

													// Subtracting Pr(tau) * x_other from b fixes that

													sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn()));
				
												}
				
											}
				
											++innerIndex;
				
										}
				
										++outerIndex;
				
									for (size_t i = 0, size = new_x.size(); i < size; ++i) {
				
										x.at(i) = new_x.at(i);
				
									}
				
									return;
				
								}
				
								// For testing only

								if (sizeof(ValueType) == sizeof(double)) {
				
									std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl;
				
									LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>");
				
								} else {
				
									std::cout << "<<< Using CUDA-FLOAT Kernels >>>" << std::endl;
				
									LOG4CPLUS_INFO(logger, "<<< Using CUDA-FLOAT Kernels >>>");
				
								}
				
									// For the current SCC, we need to perform value iteration until convergence.

									if (useGpu) {
				
								// Now, we need to determine the SCCs of the MDP and perform a topological sort.

								std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
				
								storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> const pseudoModel(A, nondeterministicChoiceIndices);
				
								// Check if the decomposition is necessary

					#ifdef STORM_HAVE_CUDAFORSTORM

										if (!resetCudaDevice()) {
				
											LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
				
											throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
				
										}
				
										//LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");

										//LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");

										//LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());

										bool result = false;
				
										localIterations = 0;
				
										if (minimize) {
				
											result = basicValueIteration_mvReduce_uint64_float_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
				
										} else {
				
											result = basicValueIteration_mvReduce_uint64_float_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
				
										}
				
										LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
				
										if (!result) {
				
											converged = false;
				
											LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
				
											throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
				
										} else {
				
											converged = true;
				
										}
				
										// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep

										// track of the maximum.

										if (localIterations > currentMaxLocalIterations) {
				
											currentMaxLocalIterations = localIterations;
				
										}
				
					#define __USE_CUDAFORSTORM_OPT true

								size_t const gpuSizeOfCompleteSystem = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(static_cast<size_t>(A.getRowCount()), nondeterministicChoiceIndices.size(), static_cast<size_t>(A.getEntryCount()));
				
								size_t const cudaFreeMemory = static_cast<size_t>(getFreeCudaMemory() * 0.95);
				
					#else

										LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
				
										throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
				
					#define __USE_CUDAFORSTORM_OPT false

								size_t const gpuSizeOfCompleteSystem = 0;
				
								size_t const cudaFreeMemory = 0;
				
					#endif

									} else {
				
										localIterations = 0;
				
										converged = false;
				
										while (!converged && localIterations < this->maximalNumberOfIterations) {
				
											// Compute x' = A*x + b.

											sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
				
											storm::utility::vector::addVectorsInPlace<float>(sccMultiplyResult, sccSubB);
				
											//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);

											//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);

								std::vector<std::pair<bool, storm::storage::StateBlock>> sccDecomposition;
				
								if (__USE_CUDAFORSTORM_OPT && (gpuSizeOfCompleteSystem < cudaFreeMemory)) {
				
									// Dummy output for SCC Times

									std::cout << "Computing the SCC Decomposition took 0ms" << std::endl;
				
											/*

											Versus:
				
											A.multiplyWithVector(*currentX, *multiplyResult);
				
											storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
				
											*/
				
											// Reduce the vector x' by applying min/max for all non-deterministic choices.

											if (minimize) {
				
												storm::utility::vector::reduceVectorMin<float>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
				
											} else {
				
												storm::utility::vector::reduceVectorMax<float>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
				
											}
				
											// Determine whether the method converged.

											// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher

											// running time. In fact, it is faster. This has to be investigated.

											// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);

											converged = storm::utility::vector::equalModuloPrecision<float>(*currentX, *swap, this->precision, this->relative);
				
											// Update environment variables.

											std::swap(currentX, swap);
				
											++localIterations;
				
											++globalIterations;
				
										}
				
										LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
				
					#ifdef STORM_HAVE_CUDAFORSTORM

									if (!resetCudaDevice()) {
				
										LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
				
										throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
				
									}
				
									// The Result of this SCC has to be taken back into the main result vector

									innerIndex = 0;
				
									for (uint_fast64_t state : scc) {
				
										x.at(state) = currentX->at(innerIndex);
				
										++innerIndex;
				
									std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now();
				
									bool result = false;
				
									size_t globalIterations = 0;
				
									if (minimize) {
				
										result = __basicValueIteration_mvReduce_uint64_minimize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations);
				
									} else {
				
										result = __basicValueIteration_mvReduce_uint64_maximize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, A.rowIndications, A.columnsAndValues, x, b, nondeterministicChoiceIndices, globalIterations);
				
									}
				
									LOG4CPLUS_INFO(logger, "Executed " << globalIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
				
									// Since the pointers for swapping in the calculation point to temps they should not be valid anymore

									currentX = nullptr;
				
									swap = nullptr;
				
									// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep

									// track of the maximum.

									if (localIterations > currentMaxLocalIterations) {
				
										currentMaxLocalIterations = localIterations;
				
									bool converged = false;
				
									if (!result) {
				
										converged = false;
				
										LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
				
										throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
				
									} else {
				
										converged = true;
				
									}
				
								}
				
								// Check if the solver converged and issue a warning otherwise.

								if (converged) {
				
									LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
				
								} else {
				
									LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
				
								}
				
							}
				
					        template<typename ValueType>
				
							void TopologicalValueIterationNondeterministicLinearEquationSolver<ValueType>::solveEquationSystem(bool minimize, storm::storage::SparseMatrix<ValueType> const& A, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<ValueType>* multiplyResult, std::vector<ValueType>* newX) const {
				
					#ifndef GPU_USE_DOUBLE

								TopologicalValueIterationNondeterministicLinearEquationSolver<float> tvindles(precision, maximalNumberOfIterations, relative);
				
								storm::storage::SparseMatrix<float> new_A = A.toValueType<float>();
				
								std::vector<float> new_x = storm::utility::vector::toValueType<float>(x);
				
								std::vector<float> const new_b = storm::utility::vector::toValueType<float>(b);
				
									std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now();
				
									std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast<std::chrono::milliseconds>(calcEndTime - calcStartTime).count() << "ms." << std::endl;
				
								tvindles.solveEquationSystem(minimize, new_A, new_x, new_b, nullptr, nullptr);
				
									std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << globalIterations << " iterations in a single block." << std::endl;
				
								for (size_t i = 0, size = new_x.size(); i < size; ++i) {
				
									x.at(i) = new_x.at(i);
				
								}
				
									// Check if the solver converged and issue a warning otherwise.

									if (converged) {
				
										LOG4CPLUS_INFO(logger, "Iterative solver converged after " << globalIterations << " iterations.");
				
									} else {
				
										LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << globalIterations << " iterations.");
				
									}
				
					#else

								// For testing only

								std::cout << "<<< Using CUDA-DOUBLE Kernels >>>" << std::endl;
				
								LOG4CPLUS_INFO(logger, "<<< Using CUDA-DOUBLE Kernels >>>");
				
								// Now, we need to determine the SCCs of the MDP and perform a topological sort.

								std::vector<uint_fast64_t> const& nondeterministicChoiceIndices = A.getRowGroupIndices();
				
								storm::models::NonDeterministicMatrixBasedPseudoModel<ValueType> pseudoModel(A, nondeterministicChoiceIndices);
				
								storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(pseudoModel, false, false);
				
									LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
				
									throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
				
					#endif

								} else {
				
									std::chrono::high_resolution_clock::time_point sccStartTime = std::chrono::high_resolution_clock::now();
				
									storm::storage::StronglyConnectedComponentDecomposition<ValueType> sccDecomposition(pseudoModel, false, false);
				
								if (sccDecomposition.size() == 0) {
				
									LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs.");
				
									throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs.";
				
								}
				
									if (sccDecomposition.size() == 0) {
				
										LOG4CPLUS_ERROR(logger, "Can not solve given Equation System as the SCC Decomposition returned no SCCs.");
				
										throw storm::exceptions::IllegalArgumentException() << "Can not solve given Equation System as the SCC Decomposition returned no SCCs.";
				
									}
				
								storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
				
								std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
				
								// Calculate the optimal distribution of sccs

								std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A);
				
								LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs.");
				
								std::vector<ValueType>* currentX = nullptr;
				
								std::vector<ValueType>* swap = nullptr;
				
								size_t currentMaxLocalIterations = 0;
				
								size_t localIterations = 0;
				
								size_t globalIterations = 0;
				
								bool converged = true;
				
								// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only

								// solved after all SCCs it depends on have been solved.

								int counter = 0;
				
								for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
				
									bool const useGpu = sccIndexIt->first;
				
									storm::storage::StateBlock const& scc = sccIndexIt->second;
				
									// Generate a sub matrix

									storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
				
									storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
				
									std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount());
				
									storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
				
									std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount());
				
									std::vector<ValueType> sccSubXSwap(sccSubmatrix.getColumnCount());
				
					                std::vector<ValueType> sccMultiplyResult(sccSubmatrix.getRowCount());
				
									// Prepare the pointers for swapping in the calculation

									currentX = &sccSubX;
				
									swap = &sccSubXSwap;
				
									storm::utility::vector::selectVectorValues<ValueType>(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states)

									std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1);
				
									sccSubNondeterministicChoiceIndices.at(0) = 0;
				
									// Pre-process all dependent states

									// Remove outgoing transitions and create the ChoiceIndices

									uint_fast64_t innerIndex = 0;
				
					                uint_fast64_t outerIndex = 0;
				
									for (uint_fast64_t state: scc) {
				
										// Choice Indices

										sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]);
				
										for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) {
				
											typename storm::storage::SparseMatrix<ValueType>::const_rows row = A.getRow(rowGroupIt);
				
											for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) {
				
												if (!subMatrixIndices.get(rowIt->getColumn())) {
				
													// This is an outgoing transition of a state in the SCC to a state not included in the SCC

													// Subtracting Pr(tau) * x_other from b fixes that

													sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn()));
				
									storm::storage::SparseMatrix<ValueType> stronglyConnectedComponentsDependencyGraph = pseudoModel.extractPartitionDependencyGraph(sccDecomposition);
				
									std::vector<uint_fast64_t> topologicalSort = storm::utility::graph::getTopologicalSort(stronglyConnectedComponentsDependencyGraph);
				
									// Calculate the optimal distribution of sccs

									std::vector<std::pair<bool, storm::storage::StateBlock>> optimalSccs = this->getOptimalGroupingFromTopologicalSccDecomposition(sccDecomposition, topologicalSort, A);
				
									LOG4CPLUS_INFO(logger, "Optimized SCC Decomposition, originally " << topologicalSort.size() << " SCCs, optimized to " << optimalSccs.size() << " SCCs.");
				
									std::chrono::high_resolution_clock::time_point sccEndTime = std::chrono::high_resolution_clock::now();
				
									std::cout << "Computing the SCC Decomposition took " << std::chrono::duration_cast<std::chrono::milliseconds>(sccEndTime - sccStartTime).count() << "ms." << std::endl;
				
									std::chrono::high_resolution_clock::time_point calcStartTime = std::chrono::high_resolution_clock::now();
				
									std::vector<ValueType>* currentX = nullptr;
				
									std::vector<ValueType>* swap = nullptr;
				
									size_t currentMaxLocalIterations = 0;
				
									size_t localIterations = 0;
				
									size_t globalIterations = 0;
				
									bool converged = true;
				
									// Iterate over all SCCs of the MDP as specified by the topological sort. This guarantees that an SCC is only

									// solved after all SCCs it depends on have been solved.

									int counter = 0;
				
									for (auto sccIndexIt = optimalSccs.cbegin(); sccIndexIt != optimalSccs.cend() && converged; ++sccIndexIt) {
				
										bool const useGpu = sccIndexIt->first;
				
										storm::storage::StateBlock const& scc = sccIndexIt->second;
				
										// Generate a sub matrix

										storm::storage::BitVector subMatrixIndices(A.getColumnCount(), scc.cbegin(), scc.cend());
				
										storm::storage::SparseMatrix<ValueType> sccSubmatrix = A.getSubmatrix(true, subMatrixIndices, subMatrixIndices);
				
										std::vector<ValueType> sccSubB(sccSubmatrix.getRowCount());
				
										storm::utility::vector::selectVectorValues<ValueType>(sccSubB, subMatrixIndices, nondeterministicChoiceIndices, b);
				
										std::vector<ValueType> sccSubX(sccSubmatrix.getColumnCount());
				
										std::vector<ValueType> sccSubXSwap(sccSubmatrix.getColumnCount());
				
										std::vector<ValueType> sccMultiplyResult(sccSubmatrix.getRowCount());
				
										// Prepare the pointers for swapping in the calculation

										currentX = &sccSubX;
				
										swap = &sccSubXSwap;
				
										storm::utility::vector::selectVectorValues<ValueType>(sccSubX, subMatrixIndices, x); // x is getCols() large, where as b and multiplyResult are getRows() (nondet. choices times states)

										std::vector<uint_fast64_t> sccSubNondeterministicChoiceIndices(sccSubmatrix.getColumnCount() + 1);
				
										sccSubNondeterministicChoiceIndices.at(0) = 0;
				
										// Pre-process all dependent states

										// Remove outgoing transitions and create the ChoiceIndices

										uint_fast64_t innerIndex = 0;
				
										uint_fast64_t outerIndex = 0;
				
										for (uint_fast64_t state : scc) {
				
											// Choice Indices

											sccSubNondeterministicChoiceIndices.at(outerIndex + 1) = sccSubNondeterministicChoiceIndices.at(outerIndex) + (nondeterministicChoiceIndices[state + 1] - nondeterministicChoiceIndices[state]);
				
											for (auto rowGroupIt = nondeterministicChoiceIndices[state]; rowGroupIt != nondeterministicChoiceIndices[state + 1]; ++rowGroupIt) {
				
												typename storm::storage::SparseMatrix<ValueType>::const_rows row = A.getRow(rowGroupIt);
				
												for (auto rowIt = row.begin(); rowIt != row.end(); ++rowIt) {
				
													if (!subMatrixIndices.get(rowIt->getColumn())) {
				
														// This is an outgoing transition of a state in the SCC to a state not included in the SCC

														// Subtracting Pr(tau) * x_other from b fixes that

														sccSubB.at(innerIndex) = sccSubB.at(innerIndex) + (rowIt->getValue() * x.at(rowIt->getColumn()));
				
													}
				
												}
				
												++innerIndex;
				
											}
				
					                        ++innerIndex;
				
											++outerIndex;
				
										}
				
					                    ++outerIndex;
				
									}
				
									// For the current SCC, we need to perform value iteration until convergence.

									if (useGpu) {
				
										// For the current SCC, we need to perform value iteration until convergence.

										if (useGpu) {
				
					#ifdef STORM_HAVE_CUDAFORSTORM

										if (!resetCudaDevice()) {
				
											LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
				
											throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
				
										}
				
											if (!resetCudaDevice()) {
				
												LOG4CPLUS_ERROR(logger, "Could not reset CUDA Device, can not use CUDA Equation Solver.");
				
												throw storm::exceptions::InvalidStateException() << "Could not reset CUDA Device, can not use CUDA Equation Solver.";
				
											}
				
										//LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");

										//LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");

										//LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());

											//LOG4CPLUS_INFO(logger, "Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%).");

											//LOG4CPLUS_INFO(logger, "We will allocate " << (sizeof(uint_fast64_t)* sccSubmatrix.rowIndications.size() + sizeof(uint_fast64_t)* sccSubmatrix.columnsAndValues.size() * 2 + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubX.size() + sizeof(double)* sccSubB.size() + sizeof(double)* sccSubB.size() + sizeof(uint_fast64_t)* sccSubNondeterministicChoiceIndices.size()) << " Bytes.");

											//LOG4CPLUS_INFO(logger, "The CUDA Runtime Version is " << getRuntimeCudaVersion());

										bool result = false;
				
										localIterations = 0;
				
										if (minimize) {
				
											result = basicValueIteration_mvReduce_uint64_double_minimize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
				
										} else {
				
											result = basicValueIteration_mvReduce_uint64_double_maximize(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
				
										}
				
										LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
				
										if (!result) {
				
											converged = false;
				
											LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
				
											throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
				
										} else {
				
											converged = true;
				
										}
				
											bool result = false;
				
											localIterations = 0;
				
											if (minimize) {
				
												result = __basicValueIteration_mvReduce_uint64_minimize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
				
											} else {
				
												result = __basicValueIteration_mvReduce_uint64_maximize<ValueType>(this->maximalNumberOfIterations, this->precision, this->relative, sccSubmatrix.rowIndications, sccSubmatrix.columnsAndValues, *currentX, sccSubB, sccSubNondeterministicChoiceIndices, localIterations);
				
											}
				
											LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations on GPU.");
				
										// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep

										// track of the maximum.

										if (localIterations > currentMaxLocalIterations) {
				
											currentMaxLocalIterations = localIterations;
				
										}
				
											if (!result) {
				
												converged = false;
				
												LOG4CPLUS_ERROR(logger, "An error occurred in the CUDA Plugin. Can not continue.");
				
												throw storm::exceptions::InvalidStateException() << "An error occurred in the CUDA Plugin. Can not continue.";
				
											} else {
				
												converged = true;
				
											}
				
											// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep

											// track of the maximum.

											if (localIterations > currentMaxLocalIterations) {
				
												currentMaxLocalIterations = localIterations;
				
											}
				
											globalIterations += localIterations;
				
					#else

										LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
				
										throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
				
											LOG4CPLUS_ERROR(logger, "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!");
				
											throw storm::exceptions::InvalidStateException() << "The useGpu Flag of a SCC was set, but this version of StoRM does not support CUDA acceleration. Internal Error!";
				
					#endif

									} else {
				
										localIterations = 0;
				
										converged = false;
				
										while (!converged && localIterations < this->maximalNumberOfIterations) {
				
											// Compute x' = A*x + b.

											sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
				
											storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
				
										} else {
				
											std::cout << "WARNING: Using CPU based TopoSolver! (double)" << std::endl;
				
											localIterations = 0;
				
											converged = false;
				
											while (!converged && localIterations < this->maximalNumberOfIterations) {
				
												// Compute x' = A*x + b.

												sccSubmatrix.multiplyWithVector(*currentX, sccMultiplyResult);
				
												storm::utility::vector::addVectorsInPlace<ValueType>(sccMultiplyResult, sccSubB);
				
												//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);

												//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);

												/*

												Versus:
				
												A.multiplyWithVector(*currentX, *multiplyResult);
				
												storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
				
												*/
				
												// Reduce the vector x' by applying min/max for all non-deterministic choices.

												if (minimize) {
				
													storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
				
												} else {
				
													storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
				
												}
				
											//A.multiplyWithVector(scc, nondeterministicChoiceIndices, *currentX, multiplyResult);

											//storm::utility::addVectors(scc, nondeterministicChoiceIndices, multiplyResult, b);

												// Determine whether the method converged.

												// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher

												// running time. In fact, it is faster. This has to be investigated.

												// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);

												converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
				
											/*

											Versus:
				
											A.multiplyWithVector(*currentX, *multiplyResult);
				
											storm::utility::vector::addVectorsInPlace(*multiplyResult, b);
				
											*/
				
												// Update environment variables.

												std::swap(currentX, swap);
				
											// Reduce the vector x' by applying min/max for all non-deterministic choices.

											if (minimize) {
				
												storm::utility::vector::reduceVectorMin<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
				
												++localIterations;
				
												++globalIterations;
				
											}
				
											else {
				
												storm::utility::vector::reduceVectorMax<ValueType>(sccMultiplyResult, *swap, sccSubNondeterministicChoiceIndices);
				
											}
				
											// Determine whether the method converged.

											// TODO: It seems that the equalModuloPrecision call that compares all values should have a higher

											// running time. In fact, it is faster. This has to be investigated.

											// converged = storm::utility::equalModuloPrecision(*currentX, *newX, scc, precision, relative);

											converged = storm::utility::vector::equalModuloPrecision<ValueType>(*currentX, *swap, this->precision, this->relative);
				
											LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
				
										}
				
											// Update environment variables.

											std::swap(currentX, swap);
				
											++localIterations;
				
											++globalIterations;
				
										// The Result of this SCC has to be taken back into the main result vector

										innerIndex = 0;
				
										for (uint_fast64_t state : scc) {
				
											x.at(state) = currentX->at(innerIndex);
				
											++innerIndex;
				
										}
				
										LOG4CPLUS_INFO(logger, "Executed " << localIterations << " of max. " << maximalNumberOfIterations << " Iterations.");
				
									}
				
										// Since the pointers for swapping in the calculation point to temps they should not be valid anymore

										currentX = nullptr;
				
										swap = nullptr;
				
									// The Result of this SCC has to be taken back into the main result vector

									innerIndex = 0;
				
									for (uint_fast64_t state: scc) {
				
										x.at(state) = currentX->at(innerIndex);
				
										++innerIndex;
				
										// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep

										// track of the maximum.

										if (localIterations > currentMaxLocalIterations) {
				
											currentMaxLocalIterations = localIterations;
				
										}
				
									}
				
									// Since the pointers for swapping in the calculation point to temps they should not be valid anymore

									currentX = nullptr;
				
									swap = nullptr;
				
									std::cout << "Used a total of " << globalIterations << " iterations with a maximum of " << localIterations << " iterations in a single block." << std::endl;
				
									// As the "number of iterations" of the full method is the maximum of the local iterations, we need to keep

									// track of the maximum.

									if (localIterations > currentMaxLocalIterations) {
				
										currentMaxLocalIterations = localIterations;
				
									// Check if the solver converged and issue a warning otherwise.

									if (converged) {
				
										LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
				
									} else {
				
										LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
				
									}
				
								}
				
								// Check if the solver converged and issue a warning otherwise.

								if (converged) {
				
									LOG4CPLUS_INFO(logger, "Iterative solver converged after " << currentMaxLocalIterations << " iterations.");
				
								}
				
								else {
				
									LOG4CPLUS_WARN(logger, "Iterative solver did not converged after " << currentMaxLocalIterations << " iterations.");
				
									std::chrono::high_resolution_clock::time_point calcEndTime = std::chrono::high_resolution_clock::now();
				
									std::cout << "Obtaining the fixpoint solution took " << std::chrono::duration_cast<std::chrono::milliseconds>(calcEndTime - calcStartTime).count() << "ms." << std::endl;
				
								}
				
					#endif

					        }
				
							template<typename ValueType>

								
									49

src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h

									View File
								
					@ -8,6 +8,11 @@
				
					#include <utility>
				
					#include <vector>
				
					#include "storm-config.h"
				
					#ifdef STORM_HAVE_CUDAFORSTORM
				
					#	include "cudaForStorm.h"
				
					#endif
				
					namespace storm {
				
					    namespace solver {
				
					@ -42,6 +47,50 @@ namespace storm {
				
								 */
				
								std::vector<std::pair<bool, storm::storage::StateBlock>> getOptimalGroupingFromTopologicalSccDecomposition(storm::storage::StronglyConnectedComponentDecomposition<ValueType> const& sccDecomposition, std::vector<uint_fast64_t> const& topologicalSort, storm::storage::SparseMatrix<ValueType> const& matrix) const;
				
					        };
				
							template <typename ValueType>
				
							bool __basicValueIteration_mvReduce_uint64_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
				
								//
				
								throw;
				
							}
				
							template <>
				
							inline bool __basicValueIteration_mvReduce_uint64_minimize<double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
				
					#ifdef STORM_HAVE_CUDAFORSTORM
				
								return basicValueIteration_mvReduce_uint64_double_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
				
					#else
				
								throw;
				
					#endif
				
							}
				
							template <>
				
							inline bool __basicValueIteration_mvReduce_uint64_minimize<float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
				
					#ifdef STORM_HAVE_CUDAFORSTORM
				
								return basicValueIteration_mvReduce_uint64_float_minimize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
				
					#else
				
								throw;
				
					#endif
				
							}
				
							template <typename ValueType>
				
							bool __basicValueIteration_mvReduce_uint64_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
				
								//
				
								throw;
				
							}
				
							template <>
				
							inline bool __basicValueIteration_mvReduce_uint64_maximize<double>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
				
					#ifdef STORM_HAVE_CUDAFORSTORM
				
								return basicValueIteration_mvReduce_uint64_double_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
				
					#else
				
								throw;
				
					#endif
				
							}
				
							template <>
				
							inline bool __basicValueIteration_mvReduce_uint64_maximize<float>(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
				
					#ifdef STORM_HAVE_CUDAFORSTORM
				
								return basicValueIteration_mvReduce_uint64_float_maximize(maxIterationCount, precision, relativePrecisionCheck, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
				
					#else
				
								throw;
				
					#endif
				
							}
				
					    } // namespace solver
				
					} // namespace storm

Added __restrict__ keyword to CUDA kernel. This should enhance compiler optimization.

4 resources/cudaForStorm/srcCuda/cuspExtensionDouble.h View File

566 src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp View File

49 src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h View File

Added restrict keyword to CUDA kernel. This should enhance compiler optimization.

4

resources/cudaForStorm/srcCuda/cuspExtensionDouble.h

View File

566

src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp

View File

49

src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h

View File