|
|
@@ -25,9 +25,6 @@
|
|
|
std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \ |
|
|
|
} } while(false) |
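// Note: only the tail of CUDA_CHECK_ALL_ERRORS() is visible in this hunk. For reference, a
// minimal sketch of such a macro, assuming the usual sync/async error-check pattern (the exact
// message wording in this file may differ):
//
//   #define CUDA_CHECK_ALL_ERRORS() do { \
//       cudaError_t errSync  = cudaGetLastError(); \
//       cudaError_t errAsync = cudaDeviceSynchronize(); \
//       if (errSync != cudaSuccess) { \
//           std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ")" << std::endl; \
//       } \
//       if (errAsync != cudaSuccess) { \
//           std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ")" << std::endl; \
//       } } while(false)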
|
|
|
|
|
|
|
__global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) { |
|
|
|
*B = *A; |
|
|
|
} |
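// Illustrative launch of the (currently trivial) kernel above; devA and devB are assumed to be
// valid device pointers to single ints. One thread suffices since the kernel copies one value:
//
//   cuda_kernel_basicValueIteration_mvReduce<<<1, 1>>>(devA, devB);
//   cudaDeviceSynchronize();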
|
|
|
|
|
|
|
template<typename T, bool Relative> |
|
|
|
struct equalModuloPrecision : public thrust::binary_function<T,T,T> |
|
|
@@ -36,18 +33,34 @@ __host__ __device__ T operator()(const T &x, const T &y) const
|
|
|
{ |
|
|
|
if (Relative) { |
|
|
|
const T result = (x - y) / y; |
|
|
|
return (result > 0) ? result : -result; |
|
|
|
return ((result >= 0) ? (result) : (-result)); |
|
|
|
} else { |
|
|
|
const T result = (x - y); |
|
|
|
return (result > 0) ? result : -result; |
|
|
|
return ((result >= 0) ? (result) : (-result)); |
|
|
|
} |
|
|
|
} |
|
|
|
}; |
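// The functor above is __host__ __device__, so it can also be evaluated on host data. A hedged,
// illustrative example of the two modes (values are made up):
//
//   equalModuloPrecision<double, true>  relDiff;  // relative difference
//   equalModuloPrecision<double, false> absDiff;  // absolute difference
//   double r = relDiff(1.05, 1.00);  // |1.05 - 1.00| / 1.00 == 0.05
//   double a = absDiff(1.05, 1.00);  // |1.05 - 1.00|        == 0.05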
|
|
|
|
|
|
|
template<typename IndexType, typename ValueType> |
|
|
|
void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) { |
|
|
|
indexVector.reserve(inputVector.size()); |
|
|
|
valueVector.reserve(inputVector.size()); |
|
|
|
for (size_t i = 0; i < inputVector.size(); ++i) { |
|
|
|
indexVector.push_back(inputVector.at(i).first); |
|
|
|
valueVector.push_back(inputVector.at(i).second); |
|
|
|
} |
|
|
|
} |
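// Usage sketch for exploadVector (illustrative data): the packed (column index, value) pairs of
// the CSR-like matrix are split into two parallel arrays.
//
//   std::vector<std::pair<uint_fast64_t, double>> packed = { {0, 0.5}, {2, 0.5} };
//   std::vector<uint_fast64_t> cols;   // becomes {0, 2}
//   std::vector<double> vals;          // becomes {0.5, 0.5}
//   exploadVector<uint_fast64_t, double>(packed, cols, vals);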
|
|
|
|
|
|
|
template <bool Minimize, bool Relative, typename IndexType, typename ValueType> |
|
|
|
void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueType const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) { |
|
|
|
|
|
|
|
std::vector<IndexType> matrixColumnIndices; |
|
|
|
std::vector<ValueType> matrixValues; |
|
|
|
exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues); |
|
|
|
|
|
|
|
IndexType* device_matrixRowIndices = nullptr; |
|
|
|
IndexType* device_matrixColIndicesAndValues = nullptr; |
|
|
|
IndexType* device_matrixColIndices = nullptr; |
|
|
|
ValueType* device_matrixValues = nullptr; |
|
|
|
ValueType* device_x = nullptr; |
|
|
|
ValueType* device_xSwap = nullptr; |
|
|
|
ValueType* device_b = nullptr; |
|
|
@@ -76,9 +89,16 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixValues), sizeof(ValueType) * matrixNnzCount); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
@@ -128,9 +148,16 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|
|
|
cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
|
|
|
std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
@@ -174,7 +201,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
|
|
|
// Data is on the device; start the kernel
|
|
|
while (!converged && iterationCount < maxIterationCount) |
|
|
|
{ // Inner scope: a goto to the cleanup label must not skip over these initializations
|
|
|
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); |
|
|
|
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult); |
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
|
|
|
|
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b); |
|
|
@@ -197,11 +224,14 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, ValueTy
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
|
|
|
|
// Reduce: take the maximum over x and check whether it is below the precision
|
|
|
ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, 0, thrust::maximum<ValueType>()); |
|
|
|
ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>()); |
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
converged = maxX < precision; |
|
|
|
converged = (maxX < precision); |
|
|
|
++iterationCount; |
|
|
|
|
|
|
|
// If there are empty rows in the matrix we need to clear multiplyResult |
|
|
|
thrust::fill(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, 0); |
|
|
|
|
|
|
|
// Swap pointers, device_x always contains the most current result |
|
|
|
std::swap(device_x, device_xSwap); |
|
|
|
} |
|
|
@@ -223,12 +253,19 @@ cleanup:
|
|
|
} |
|
|
|
device_matrixRowIndices = nullptr; |
|
|
|
} |
|
|
|
if (device_matrixColIndicesAndValues != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); |
|
|
|
if (device_matrixColIndices != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_matrixColIndices = nullptr; |
|
|
|
} |
|
|
|
if (device_matrixValues != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_matrixValues); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_matrixColIndicesAndValues = nullptr; |
|
|
|
device_matrixValues = nullptr; |
|
|
|
} |
|
|
|
if (device_x != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_x); |
|
|
@@ -269,8 +306,13 @@ cleanup:
|
|
|
|
|
|
|
template <typename IndexType, typename ValueType> |
|
|
|
void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) { |
|
|
|
std::vector<IndexType> matrixColumnIndices; |
|
|
|
std::vector<ValueType> matrixValues; |
|
|
|
exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues); |
|
|
|
|
|
|
|
IndexType* device_matrixRowIndices = nullptr; |
|
|
|
IndexType* device_matrixColIndicesAndValues = nullptr; |
|
|
|
IndexType* device_matrixColIndices = nullptr; |
|
|
|
ValueType* device_matrixValues = nullptr; |
|
|
|
ValueType* device_x = nullptr; |
|
|
|
ValueType* device_multiplyResult = nullptr; |
|
|
|
|
|
|
@@ -292,9 +334,16 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndices), sizeof(IndexType) * matrixNnzCount); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Matrix Column Indices, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixValues), sizeof(ValueType) * matrixNnzCount); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
std::cout << "Could not allocate memory for Matrix Values, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
@@ -323,9 +372,16 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|
|
|
cudaCopyResult = cudaMemcpy(device_matrixColIndices, matrixColumnIndices.data(), (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy data for Matrix Column Indices, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaCopyResult = cudaMemcpy(device_matrixValues, matrixValues.data(), (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
|
|
|
std::cout << "Could not copy data for Matrix Values, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
@@ -344,7 +400,7 @@ void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<In
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); |
|
|
|
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<IndexType, ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndices, device_matrixValues, device_x, device_multiplyResult); |
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
|
|
|
|
// Get result back from the device |
|
|
@@ -363,12 +419,19 @@ cleanup:
|
|
|
} |
|
|
|
device_matrixRowIndices = nullptr; |
|
|
|
} |
|
|
|
if (device_matrixColIndicesAndValues != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); |
|
|
|
if (device_matrixColIndices != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndices); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
std::cout << "Could not free Memory of Matrix Column Indices, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_matrixColIndicesAndValues = nullptr; |
|
|
|
device_matrixColIndices = nullptr; |
|
|
|
} |
|
|
|
if (device_matrixValues != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_matrixValues); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Matrix Values, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_matrixValues = nullptr; |
|
|
|
} |
|
|
|
if (device_x != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_x); |
|
|
@@ -386,19 +449,266 @@ cleanup:
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
template <typename ValueType> |
|
|
|
void basicValueIteration_addVectorsInplace(std::vector<ValueType>& a, std::vector<ValueType> const& b) { |
|
|
|
ValueType* device_a = nullptr; |
|
|
|
ValueType* device_b = nullptr; |
|
|
|
|
|
|
|
const size_t vectorSize = std::max(a.size(), b.size()); |
|
|
|
|
|
|
|
cudaError_t cudaMallocResult; |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_a), sizeof(ValueType) * vectorSize); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * vectorSize); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
// Memory allocated, copy data to device |
|
|
|
cudaError_t cudaCopyResult; |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
do { |
|
|
|
// Transform: add b to a element-wise, storing the result in place in a
|
|
|
thrust::device_ptr<ValueType> devicePtrThrust_a(device_a); |
|
|
|
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b); |
|
|
|
thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus<ValueType>()); |
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
} while (false); |
|
|
|
|
|
|
|
// Get result back from the device |
|
|
|
cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
// All code related to freeing memory and clearing up the device |
|
|
|
cleanup: |
|
|
|
if (device_a != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_a); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_a = nullptr; |
|
|
|
} |
|
|
|
if (device_b != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_b); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_b = nullptr; |
|
|
|
} |
|
|
|
} |
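// Usage sketch (illustrative values): after the call, a holds the element-wise sum, computed on
// the device via thrust::transform.
//
//   std::vector<double> a = { 1.0, 2.0 };
//   std::vector<double> b = { 0.5, 0.5 };
//   basicValueIteration_addVectorsInplace<double>(a, b);   // a == { 1.5, 2.5 }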
|
|
|
|
|
|
|
template <typename IndexType, typename ValueType, bool Minimize> |
|
|
|
void basicValueIteration_reduceGroupedVector(std::vector<ValueType> const& groupedVector, std::vector<IndexType> const& grouping, std::vector<ValueType>& targetVector) { |
|
|
|
ValueType* device_groupedVector = nullptr; |
|
|
|
IndexType* device_grouping = nullptr; |
|
|
|
ValueType* device_target = nullptr; |
|
|
|
|
|
|
|
const size_t groupedSize = groupedVector.size(); |
|
|
|
const size_t groupingSize = grouping.size(); |
|
|
|
const size_t targetSize = targetVector.size(); |
|
|
|
|
|
|
|
cudaError_t cudaMallocResult; |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_groupedVector), sizeof(ValueType) * groupedSize); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_grouping), sizeof(IndexType) * groupingSize); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_target), sizeof(ValueType) * targetSize); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
// Memory allocated, copy data to device |
|
|
|
cudaError_t cudaCopyResult; |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
do { |
|
|
|
// Reduce: reduce each group of groupedVector into targetVector (minimum or maximum per group)
|
|
|
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, IndexType, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); |
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
} while (false); |
|
|
|
|
|
|
|
// Get result back from the device |
|
|
|
cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
// All code related to freeing memory and clearing up the device |
|
|
|
cleanup: |
|
|
|
if (device_groupedVector != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_groupedVector); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_groupedVector = nullptr; |
|
|
|
} |
|
|
|
if (device_grouping != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_grouping); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_grouping = nullptr; |
|
|
|
} |
|
|
|
if (device_target != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_target); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_target = nullptr; |
|
|
|
} |
|
|
|
} |
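// Usage sketch (illustrative data): 'grouping' holds CSR-style offsets, so groupingSize - 1 is
// the number of groups; each group of groupedVector is reduced to its minimum (Minimize == true)
// or maximum (Minimize == false).
//
//   std::vector<double> grouped = { 0.3, 0.7, 0.1, 0.9 };
//   std::vector<uint_fast64_t> grouping = { 0, 2, 4 };     // two groups: [0,2) and [2,4)
//   std::vector<double> target(2);
//   basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(grouped, grouping, target);
//   // target == { 0.3, 0.1 }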
|
|
|
|
|
|
|
template <typename ValueType, bool Relative> |
|
|
|
void basicValueIteration_equalModuloPrecision(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType& maxElement) { |
|
|
|
ValueType* device_x = nullptr; |
|
|
|
ValueType* device_y = nullptr; |
|
|
|
|
|
|
|
const size_t vectorSize = x.size(); |
|
|
|
|
|
|
|
cudaError_t cudaMallocResult; |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * vectorSize); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_y), sizeof(ValueType) * vectorSize); |
|
|
|
if (cudaMallocResult != cudaSuccess) { |
|
|
|
std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
// Memory allocated, copy data to device |
|
|
|
cudaError_t cudaCopyResult; |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
|
|
|
if (cudaCopyResult != cudaSuccess) { |
|
|
|
std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl; |
|
|
|
goto cleanup; |
|
|
|
} |
|
|
|
|
|
|
|
do { |
|
|
|
// Transform: x = abs(x - y), additionally divided by y in the Relative case
|
|
|
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x); |
|
|
|
thrust::device_ptr<ValueType> devicePtrThrust_y(device_y); |
|
|
|
thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>()); |
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
|
|
|
|
// Reduce: take the maximum over x; the caller compares it against the precision
|
|
|
maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>()); |
|
|
|
CUDA_CHECK_ALL_ERRORS(); |
|
|
|
} while (false); |
|
|
|
|
|
|
|
// All code related to freeing memory and clearing up the device |
|
|
|
cleanup: |
|
|
|
if (device_x != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_x); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_x = nullptr; |
|
|
|
} |
|
|
|
if (device_y != nullptr) { |
|
|
|
cudaError_t cudaFreeResult = cudaFree(device_y); |
|
|
|
if (cudaFreeResult != cudaSuccess) { |
|
|
|
std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." << std::endl; |
|
|
|
} |
|
|
|
device_y = nullptr; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/* |
|
|
|
* Declare and implement all exported functions for these Kernels here |
|
|
|
* |
|
|
|
*/ |
|
|
|
|
|
|
|
void cudaForStormTestFunction(int a, int b) { |
|
|
|
std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl; |
|
|
|
} |
|
|
|
|
|
|
|
void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) { |
|
|
|
basicValueIteration_spmv(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b); |
|
|
|
} |
|
|
|
|
|
|
|
void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) { |
|
|
|
basicValueIteration_addVectorsInplace<double>(a, b); |
|
|
|
} |
|
|
|
|
|
|
|
void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) { |
|
|
|
basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(groupedVector, grouping, targetVector); |
|
|
|
} |
|
|
|
|
|
|
|
void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) { |
|
|
|
basicValueIteration_reduceGroupedVector<uint_fast64_t, double, false>(groupedVector, grouping, targetVector); |
|
|
|
} |
|
|
|
|
|
|
|
void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) { |
|
|
|
basicValueIteration_equalModuloPrecision<double, true>(x, y, maxElement); |
|
|
|
} |
|
|
|
|
|
|
|
void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) { |
|
|
|
basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement); |
|
|
|
} |
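// Usage sketch for the two convergence-check wrappers above (illustrative values): maxElement
// receives the largest element-wise (relative or absolute) difference between the two vectors.
//
//   std::vector<double> xOld = { 1.0, 2.0 };
//   std::vector<double> xNew = { 1.1, 2.0 };
//   double maxDiff = 0.0;
//   basicValueIteration_equalModuloPrecision_double_NonRelative(xNew, xOld, maxDiff);
//   // maxDiff == 0.1 (up to floating-point rounding)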
|
|
|
|
|
|
|
void basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) { |
|
|
|
if (relativePrecisionCheck) { |
|
|
|
basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices); |
|
|
|