All tests for CUDA based TopologicalValueIterationMdpPrctlModelChecker passing on Windows.
Former-commit-id: 68cafa6f84
tempestpy_adaptions
David_Korzeniewski
10 years ago
27 changed files with 2509 additions and 43 deletions
  103  CMakeLists.txt
   64  cuda/CMakeAlignmentCheck.cpp
   31  cuda/CMakeFloatAlignmentCheck.cpp
    6  cuda/kernels/allCudaKernels.h
    0  cuda/kernels/bandWidth.cu
    0  cuda/kernels/bandWidth.h
  286  cuda/kernels/basicAdd.cu
    9  cuda/kernels/basicAdd.h
  879  cuda/kernels/basicValueIteration.cu
  119  cuda/kernels/basicValueIteration.h
   19  cuda/kernels/cudaForStorm.h
   49  cuda/kernels/cuspExtension.h
  361  cuda/kernels/cuspExtensionDouble.h
  375  cuda/kernels/cuspExtensionFloat.h
   39  cuda/kernels/kernelSwitchTest.cu
    1  cuda/kernels/kernelSwitchTest.h
   33  cuda/kernels/utility.cu
   12  cuda/kernels/utility.h
   28  cuda/kernels/version.cu
   16  cuda/kernels/version.h
   21  cuda/storm-cudaplugin-config.h.in
    2  src/solver/NativeNondeterministicLinearEquationSolver.cpp
   24  src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.cpp
   38  src/solver/TopologicalValueIterationNondeterministicLinearEquationSolver.h
    9  src/storage/SparseMatrix.cpp
   12  src/storage/SparseMatrix.h
   16  test/functional/modelchecker/TopologicalValueIterationMdpPrctlModelCheckerTest.cpp
cuda/CMakeAlignmentCheck.cpp
@@ -0,0 +1,64 @@
/*
 * This is a component of StoRM's CUDA plugin. It checks whether the type alignment matches the assumptions made while optimizing the code.
 */
#include <cstdint>
#include <utility>
#include <vector>

#define CONTAINER_SIZE 100ul

template <typename IndexType, typename ValueType>
int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
    std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
    for (size_t i = 0; i < containerSize; ++i) {
        myVector->push_back(std::make_pair(firstValue, secondValue));
    }
    size_t myVectorSize = myVector->size();
    IndexType* firstStart = &(myVector->at(0).first);
    IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
    ValueType* secondStart = &(myVector->at(0).second);
    ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
    size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
    size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
    size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
    size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);

    delete myVector;
    myVector = nullptr;

    if (myVectorSize != containerSize) {
        return -2;
    }

    // Check for alignment:
    // The requirement is that the pairs are laid out as: first, second, first, second, first, second, ...
    if (sizeof(IndexType) != sizeof(ValueType)) {
        return -3;
    }
    if (startOffset != sizeof(IndexType)) {
        return -4;
    }
    if (endOffset != sizeof(IndexType)) {
        return -5;
    }
    if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
        return -6;
    }
    if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
        return -7;
    }

    return 0;
}

int main(int argc, char* argv[]) {
    int result = 0;

    result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
    if (result != 0) {
        return result;
    }

    return 0;
}
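Why this layout matters (illustrative sketch, not part of this changeset): the solver later hands std::vector<std::pair<IndexType, ValueType>>::data() directly to cudaMemcpy, so the pairs must be packed as interleaved first/second words. A minimal host-only sketch of that assumption, assuming uint_fast64_t is 8 bytes wide as on the usual 64-bit targets:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        // The compile fails here if the pair is not packed exactly as "first, second".
        static_assert(sizeof(std::pair<uint_fast64_t, double>) == 2 * sizeof(uint_fast64_t),
                      "pair must be packed as first, second with no padding");

        std::vector<std::pair<uint_fast64_t, double>> entries = {{7, 0.25}, {11, 0.75}};

        // Reinterpret the contiguous pair storage as raw interleaved words: even words are
        // the column indices, odd words hold the bit patterns of the double values.
        uint_fast64_t rawIndex;
        double rawValue;
        auto const* bytes = reinterpret_cast<char const*>(entries.data());
        std::memcpy(&rawIndex, bytes + sizeof(std::pair<uint_fast64_t, double>), sizeof(rawIndex));
        std::memcpy(&rawValue, bytes + sizeof(std::pair<uint_fast64_t, double>) + sizeof(uint_fast64_t), sizeof(rawValue));

        std::cout << "second entry: index " << rawIndex << ", value " << rawValue << std::endl;
        return 0;
    }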
cuda/CMakeFloatAlignmentCheck.cpp
@@ -0,0 +1,31 @@
/*
 * This is a component of StoRM's CUDA plugin. It checks whether a pair of uint_fast64_t and float gets auto-aligned to match 64-bit boundaries.
 */
#include <cstdint>
#include <utility>
#include <vector>

#define CONTAINER_SIZE 100ul

int main(int argc, char* argv[]) {
    int result = 0;

    std::vector<std::pair<uint_fast64_t, float>> myVector;
    for (size_t i = 0; i < CONTAINER_SIZE; ++i) {
        myVector.push_back(std::make_pair(i, 42.12345f * i));
    }

    char* firstUintPointer = reinterpret_cast<char*>(&(myVector.at(0).first));
    char* secondUintPointer = reinterpret_cast<char*>(&(myVector.at(1).first));
    ptrdiff_t uintDiff = secondUintPointer - firstUintPointer;

    if (uintDiff == (2 * sizeof(uint_fast64_t))) {
        result = 2;
    } else if (uintDiff == (sizeof(uint_fast64_t) + sizeof(float))) {
        result = 3;
    } else {
        result = -5;
    }

    return result;
}
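For comparison (illustrative sketch, not part of this changeset), the same padded-versus-packed distinction can be read off sizeof directly; exit code 2 above corresponds to a pair padded up to two 64-bit words, exit code 3 to a packed 12-byte pair:

    #include <cstdint>
    #include <iostream>
    #include <utility>

    int main() {
        // Stride between consecutive pairs in a vector equals sizeof(pair).
        constexpr std::size_t pairSize = sizeof(std::pair<uint_fast64_t, float>);
        if (pairSize == 2 * sizeof(uint_fast64_t)) {
            std::cout << "pair is padded to 64-bit boundaries (check above would return 2)" << std::endl;
        } else if (pairSize == sizeof(uint_fast64_t) + sizeof(float)) {
            std::cout << "pair is packed (check above would return 3)" << std::endl;
        } else {
            std::cout << "unexpected layout, pair size is " << pairSize << std::endl;
        }
        return 0;
    }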
cuda/kernels/allCudaKernels.h
@@ -0,0 +1,6 @@
#include "utility.h"
#include "bandWidth.h"
#include "basicAdd.h"
#include "kernelSwitchTest.h"
#include "basicValueIteration.h"
#include "version.h"
cuda/kernels/basicAdd.cu
@@ -0,0 +1,286 @@
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>

#include <chrono>
#include <iostream>

__global__ void cuda_kernel_basicAdd(int a, int b, int *c) {
    *c = a + b;
}

__global__ void cuda_kernel_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
    // Fused Multiply Add:
    // A * B + C => D

    /*
     * The variable i indexes into the arrays. Since every thread executes this kernel,
     * i has to be different for each thread; otherwise different threads would write to
     * the same array index. blockDim.x is the number of threads in the x-dimension of the
     * block, blockIdx.x is the x-coordinate of the current block, and threadIdx.x is the
     * x-coordinate of the thread that is currently executing this function.
     */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N) {
        D[i] = A[i] * B[i] + C[i];
    }
}

__global__ void cuda_kernel_arrayFmaOptimized(int * const A, int const N, int const M) {
    // Fused Multiply Add:
    // A * B + C => D

    // Layout:
    // A B C D A B C D A B C D

    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if ((i*M) < N) {
        for (int j = i*M; j < i*M + M; ++j) {
            A[j*4 + 3] = A[j*4] * A[j*4 + 1] + A[j*4 + 2];
        }
    }
}

extern "C" int cuda_basicAdd(int a, int b) {
    int c = 0;
    int *dev_c;
    cudaMalloc((void**)&dev_c, sizeof(int));
    cuda_kernel_basicAdd<<<1, 1>>>(a, b, dev_c);
    cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    //printf("%d + %d + 42 is %d\n", a, b, c);
    cudaFree(dev_c);
    return c;
}

void cpp_cuda_bandwidthTest(int entryCount, int N) {
    // Size of the arrays in bytes
    size_t arraySize = entryCount * sizeof(int);

    int* deviceIntArray;
    int* hostIntArray = new int[arraySize];

    // Allocate space on the device
    auto start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
            std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes!" << std::endl;
            delete[] hostIntArray;
            return;
        }
        // Free memory on device
        if (cudaFree(deviceIntArray) != cudaSuccess) {
            std::cout << "Error in cudaFree!" << std::endl;
            delete[] hostIntArray;
            return;
        }
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    double mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Allocating the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second allocation speed." << std::endl;

    if (cudaMalloc((void**)&deviceIntArray, arraySize) != cudaSuccess) {
        std::cout << "Error in cudaMalloc while allocating " << arraySize << " Bytes for copyTest!" << std::endl;
        delete[] hostIntArray;
        return;
    }

    // Prepare data
    for (int i = 0; i < N; ++i) {
        hostIntArray[i] = i * 333 + 123;
    }

    // Copy data TO device
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMemcpy(deviceIntArray, hostIntArray, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to device!" << std::endl;
            // Free memory on device
            if (cudaFree(deviceIntArray) != cudaSuccess) {
                std::cout << "Error in cudaFree!" << std::endl;
            }
            delete[] hostIntArray;
            return;
        }
    }
    end_time = std::chrono::high_resolution_clock::now();
    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second TO device." << std::endl;

    // Copy data FROM device
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        if (cudaMemcpy(hostIntArray, deviceIntArray, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
            std::cout << "Error in cudaMemcpy while copying " << arraySize << " Bytes to host!" << std::endl;
            // Free memory on device
            if (cudaFree(deviceIntArray) != cudaSuccess) {
                std::cout << "Error in cudaFree!" << std::endl;
            }
            delete[] hostIntArray;
            return;
        }
    }
    end_time = std::chrono::high_resolution_clock::now();
    copyTime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    mBytesPerSecond = (((double)(N * arraySize)) / copyTime) * 0.95367431640625;
    std::cout << "Copying the Array " << N << " times took " << copyTime << " Microseconds." << std::endl;
    std::cout << "Resulting in " << mBytesPerSecond << " MBytes per Second FROM device." << std::endl;

    // Free memory on device
    if (cudaFree(deviceIntArray) != cudaSuccess) {
        std::cout << "Error in cudaFree!" << std::endl;
    }
    delete[] hostIntArray;
}

extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
    // Size of the arrays in bytes
    size_t arraySize = N * sizeof(int);

    int* deviceIntArrayA;
    int* deviceIntArrayB;
    int* deviceIntArrayC;
    int* deviceIntArrayD;

    // Allocate space on the device
    if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc1!\n");
        return;
    }
    if (cudaMalloc((void**)&deviceIntArrayB, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc2!\n");
        cudaFree(deviceIntArrayA);
        return;
    }
    if (cudaMalloc((void**)&deviceIntArrayC, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc3!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        return;
    }
    if (cudaMalloc((void**)&deviceIntArrayD, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc4!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        return;
    }

    // Copy data TO device
    if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }
    if (cudaMemcpy(deviceIntArrayB, B, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }
    if (cudaMemcpy(deviceIntArrayC, C, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }

    // Set the number of threads per block
    int threadsPerBlock = 512;
    // Use as many blocks as are needed to process all elements of the vectors
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Run kernel
    cuda_kernel_arrayFma<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, deviceIntArrayB, deviceIntArrayC, deviceIntArrayD, N);

    // Copy data FROM device
    if (cudaMemcpy(D, deviceIntArrayD, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        cudaFree(deviceIntArrayB);
        cudaFree(deviceIntArrayC);
        cudaFree(deviceIntArrayD);
        return;
    }

    // Free memory on device
    cudaFree(deviceIntArrayA);
    cudaFree(deviceIntArrayB);
    cudaFree(deviceIntArrayC);
    cudaFree(deviceIntArrayD);
}

extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M) {
    // Size of the array in bytes
    size_t arraySize = N * sizeof(int) * 4;

    int* deviceIntArrayA;

    // Allocate space on the device
    if (cudaMalloc((void**)&deviceIntArrayA, arraySize) != cudaSuccess) {
        printf("Error in cudaMalloc1!\n");
        return;
    }

#define ONFAILFREE0() do { } while(0)
#define ONFAILFREE1(a) do { cudaFree(a); } while(0)
#define ONFAILFREE2(a, b) do { cudaFree(a); cudaFree(b); } while(0)
#define ONFAILFREE3(a, b, c) do { cudaFree(a); cudaFree(b); cudaFree(c); } while(0)
#define ONFAILFREE4(a, b, c, d) do { cudaFree(a); cudaFree(b); cudaFree(c); cudaFree(d); } while(0)
#define CHECKED_CUDA_CALL(func__, freeArgs, ...) do { cudaError_t retCode = cuda##func__ (__VA_ARGS__); if (retCode != cudaSuccess) { freeArgs; printf("Error in cuda" #func__ "!\n"); return; } } while(0)

    // Copy data TO device

    CHECKED_CUDA_CALL(Memcpy, ONFAILFREE1(deviceIntArrayA), deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice);

    /*if (cudaMemcpy(deviceIntArrayA, A, arraySize, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        return;
    }*/

    // Set the number of threads per block
    int threadsPerBlock = 512;
    // Use as many blocks as are needed to process all elements of the vectors
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Run kernel
    cuda_kernel_arrayFmaOptimized<<<blocksPerGrid, threadsPerBlock>>>(deviceIntArrayA, N, M);

    // Copy data FROM device
    if (cudaMemcpy(A, deviceIntArrayA, arraySize, cudaMemcpyDeviceToHost) != cudaSuccess) {
        printf("Error in cudaMemcpy!\n");
        cudaFree(deviceIntArrayA);
        return;
    }

    // Free memory on device
    if (cudaFree(deviceIntArrayA) != cudaSuccess) {
        printf("Error in cudaFree!\n");
        return;
    }
}

extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N) {
    for (int i = 0; i < N; ++i) {
        D[i] = A[i] * B[i] + C[i];
    }
}

extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N) {
    for (int i = 0; i < N; i += 4) {
        A[i+3] = A[i] * A[i+1] + A[i+2];
    }
}
cuda/kernels/basicAdd.h
@@ -0,0 +1,9 @@
extern "C" int cuda_basicAdd(int a, int b);

extern "C" void cuda_arrayFmaOptimized(int * const A, int const N, int const M);
extern "C" void cuda_arrayFmaOptimizedHelper(int * const A, int const N);

extern "C" void cuda_arrayFma(int const * const A, int const * const B, int const * const C, int * const D, int const N);
extern "C" void cuda_arrayFmaHelper(int const * const A, int const * const B, int const * const C, int * const D, int const N);

void cpp_cuda_bandwidthTest(int entryCount, int N);
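A possible host-side driver (illustrative sketch, not part of this changeset) that exercises these entry points and checks the GPU FMA against the CPU helper:

    #include <cstdio>
    #include <vector>

    #include "basicAdd.h"

    int main() {
        int const n = 1024;
        std::vector<int> a(n), b(n), c(n), gpuResult(n), cpuResult(n);
        for (int i = 0; i < n; ++i) {
            a[i] = i;
            b[i] = 2 * i;
            c[i] = 3;
        }

        cuda_arrayFma(a.data(), b.data(), c.data(), gpuResult.data(), n);        // device FMA
        cuda_arrayFmaHelper(a.data(), b.data(), c.data(), cpuResult.data(), n);  // host reference

        int mismatches = 0;
        for (int i = 0; i < n; ++i) {
            if (gpuResult[i] != cpuResult[i]) {
                ++mismatches;
            }
        }
        std::printf("basicAdd(3, 4) = %d, FMA mismatches: %d\n", cuda_basicAdd(3, 4), mismatches);
        return mismatches == 0 ? 0 : 1;
    }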
cuda/kernels/basicValueIteration.cu
@@ -0,0 +1,879 @@
#include "basicValueIteration.h"
#define CUSP_USE_TEXTURE_MEMORY

#include <iostream>
#include <chrono>

#include <cuda_runtime.h>
#include "cusparse_v2.h"

#include "utility.h"

#include "cuspExtension.h"

#include <thrust/transform.h>
#include <thrust/device_ptr.h>
#include <thrust/functional.h>

#include "storm-cudaplugin-config.h"

#ifdef DEBUG
#define CUDA_CHECK_ALL_ERRORS() do { cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) { std::cout << "(DLL) Sync kernel error: " << cudaGetErrorString(errSync) << " (Code: " << errSync << ") in Line " << __LINE__ << std::endl; } if (errAsync != cudaSuccess) { std::cout << "(DLL) Async kernel error: " << cudaGetErrorString(errAsync) << " (Code: " << errAsync << ") in Line " << __LINE__ << std::endl; } } while(false)
#else
#define CUDA_CHECK_ALL_ERRORS() do {} while (false)
#endif

template<typename T, bool Relative>
struct equalModuloPrecision : public thrust::binary_function<T,T,T>
{
    __host__ __device__ T operator()(const T &x, const T &y) const
    {
        if (Relative) {
            if (y == 0) {
                return ((x >= 0) ? (x) : (-x));
            }
            const T result = (x - y) / y;
            return ((result >= 0) ? (result) : (-result));
        } else {
            const T result = (x - y);
            return ((result >= 0) ? (result) : (-result));
        }
    }
};

template<typename IndexType, typename ValueType>
void exploadVector(std::vector<std::pair<IndexType, ValueType>> const& inputVector, std::vector<IndexType>& indexVector, std::vector<ValueType>& valueVector) {
    indexVector.reserve(inputVector.size());
    valueVector.reserve(inputVector.size());
    for (size_t i = 0; i < inputVector.size(); ++i) {
        indexVector.push_back(inputVector.at(i).first);
        valueVector.push_back(inputVector.at(i).second);
    }
}

// TEMPLATE VERSION
template <bool Minimize, bool Relative, typename IndexType, typename ValueType>
bool basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, double const precision, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, ValueType>> const& columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices, size_t& iterationCount) {
    //std::vector<IndexType> matrixColumnIndices;
    //std::vector<ValueType> matrixValues;
    //exploadVector<IndexType, ValueType>(columnIndicesAndValues, matrixColumnIndices, matrixValues);
    bool errorOccured = false;

    IndexType* device_matrixRowIndices = nullptr;
    ValueType* device_matrixColIndicesAndValues = nullptr;
    ValueType* device_x = nullptr;
    ValueType* device_xSwap = nullptr;
    ValueType* device_b = nullptr;
    ValueType* device_multiplyResult = nullptr;
    IndexType* device_nondeterministicChoiceIndices = nullptr;

#ifdef DEBUG
    std::cout.sync_with_stdio(true);
    std::cout << "(DLL) Entering CUDA Function: basicValueIteration_mvReduce" << std::endl;
    std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << " Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory())) * 100 << "%)." << std::endl;
    size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size() + sizeof(ValueType) * b.size() + sizeof(IndexType) * nondeterministicChoiceIndices.size();
    std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl;
#endif

    const IndexType matrixRowCount = matrixRowIndices.size() - 1;
    const IndexType matrixColCount = nondeterministicChoiceIndices.size() - 1;
    const IndexType matrixNnzCount = columnIndicesAndValues.size();

    cudaError_t cudaMallocResult;

    bool converged = false;
    iterationCount = 0;

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1));
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE true
#else
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE false
#endif
    if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
        CUDA_CHECK_ALL_ERRORS();
        cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount);
        if (cudaMallocResult != cudaSuccess) {
            std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
            errorOccured = true;
            goto cleanup;
        }
    } else {
        CUDA_CHECK_ALL_ERRORS();
        cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount);
        if (cudaMallocResult != cudaSuccess) {
            std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
            errorOccured = true;
            goto cleanup;
        }
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount);
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_xSwap), sizeof(ValueType) * matrixColCount);
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Vector x swap, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * matrixRowCount);
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount);
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_nondeterministicChoiceIndices), sizeof(IndexType) * (matrixColCount + 1));
    if (cudaMallocResult != cudaSuccess) {
        std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
        errorOccured = true;
        goto cleanup;
    }

#ifdef DEBUG
    std::cout << "(DLL) Finished allocating memory." << std::endl;
#endif

    // Memory allocated, copy data to device
    cudaError_t cudaCopyResult;

    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    // Copy all data as floats are expanded to 64bits :/
    if (sizeof(ValueType) == sizeof(float) && STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT_VALUE) {
        CUDA_CHECK_ALL_ERRORS();
        cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice);
        if (cudaCopyResult != cudaSuccess) {
            std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
            errorOccured = true;
            goto cleanup;
        }
    } else {
        CUDA_CHECK_ALL_ERRORS();
        cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice);
        if (cudaCopyResult != cudaSuccess) {
            std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
            errorOccured = true;
            goto cleanup;
        }
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    // Preset the xSwap to zeros...
    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemset(device_xSwap, 0, sizeof(ValueType) * matrixColCount);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not zero the Swap Vector x, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * matrixRowCount, cudaMemcpyHostToDevice);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    // Preset the multiplyResult to zeros...
    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

    CUDA_CHECK_ALL_ERRORS();
    cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * (matrixColCount + 1), cudaMemcpyHostToDevice);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not copy data for Nondeterministic Choice Indices, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

#ifdef DEBUG
    std::cout << "(DLL) Finished copying data to GPU memory." << std::endl;
#endif

    // Data is on device, start the kernel
    while (!converged && iterationCount < maxIterationCount) { // Kept in a nested scope so that the goto to cleanup does not jump across these initializations
        cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult);
        CUDA_CHECK_ALL_ERRORS();

        thrust::device_ptr<ValueType> devicePtrThrust_b(device_b);
        thrust::device_ptr<ValueType> devicePtrThrust_multiplyResult(device_multiplyResult);

        // Transform: Add multiplyResult + b in place to multiplyResult
        thrust::transform(devicePtrThrust_multiplyResult, devicePtrThrust_multiplyResult + matrixRowCount, devicePtrThrust_b, devicePtrThrust_multiplyResult, thrust::plus<ValueType>());
        CUDA_CHECK_ALL_ERRORS();

        // Reduce: Reduce multiplyResult to a new x vector
        cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(matrixColCount, matrixRowCount, device_nondeterministicChoiceIndices, device_xSwap, device_multiplyResult);
        CUDA_CHECK_ALL_ERRORS();

        // Check for convergence
        // Transform: x = abs(x - xSwap) / xSwap
        thrust::device_ptr<ValueType> devicePtrThrust_x(device_x);
        thrust::device_ptr<ValueType> devicePtrThrust_x_end(device_x + matrixColCount);
        thrust::device_ptr<ValueType> devicePtrThrust_xSwap(device_xSwap);
        thrust::transform(devicePtrThrust_x, devicePtrThrust_x_end, devicePtrThrust_xSwap, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>());
        CUDA_CHECK_ALL_ERRORS();

        // Reduce: get the maximum over x and check for res < precision
        ValueType maxX = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x_end, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>());
        CUDA_CHECK_ALL_ERRORS();
        converged = (maxX < precision);
        ++iterationCount;

        // Swap pointers, device_x always contains the most current result
        std::swap(device_x, device_xSwap);
    }

    if (!converged && (iterationCount == maxIterationCount)) {
        iterationCount = 0;
        errorOccured = true;
    }

#ifdef DEBUG
    std::cout << "(DLL) Finished kernel execution." << std::endl;
    std::cout << "(DLL) Executed " << iterationCount << " of max. " << maxIterationCount << " Iterations." << std::endl;
#endif

    // Get x back from the device
    cudaCopyResult = cudaMemcpy(x.data(), device_x, sizeof(ValueType) * matrixColCount, cudaMemcpyDeviceToHost);
    if (cudaCopyResult != cudaSuccess) {
        std::cout << "Could not copy back data for result vector x, Error Code " << cudaCopyResult << std::endl;
        errorOccured = true;
        goto cleanup;
    }

#ifdef DEBUG
    std::cout << "(DLL) Finished copying result data." << std::endl;
#endif

    // All code related to freeing memory and clearing up the device
cleanup:
    if (device_matrixRowIndices != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_matrixRowIndices = nullptr;
    }
    if (device_matrixColIndicesAndValues != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_matrixColIndicesAndValues = nullptr;
    }
    if (device_x != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_x);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_x = nullptr;
    }
    if (device_xSwap != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_xSwap);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Vector x swap, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_xSwap = nullptr;
    }
    if (device_b != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_b);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_b = nullptr;
    }
    if (device_multiplyResult != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_multiplyResult);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_multiplyResult = nullptr;
    }
    if (device_nondeterministicChoiceIndices != nullptr) {
        cudaError_t cudaFreeResult = cudaFree(device_nondeterministicChoiceIndices);
        if (cudaFreeResult != cudaSuccess) {
            std::cout << "Could not free Memory of Nondeterministic Choice Indices, Error Code " << cudaFreeResult << "." << std::endl;
            errorOccured = true;
        }
        device_nondeterministicChoiceIndices = nullptr;
    }
#ifdef DEBUG
    std::cout << "(DLL) Finished cleanup." << std::endl;
#endif

    return !errorOccured;
}
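For reference (illustrative sketch, not part of this changeset), one sweep of the device loop above corresponds to the following host-side computation; the CSR layout and the row grouping via nondeterministicChoiceIndices match the function's parameters, while all other names are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // One host-side value-iteration sweep mirroring the device loop: SpMV, add b,
    // reduce each row group to its min/max, then measure the (relative) change.
    bool cpuValueIterationStep(bool minimize, bool relative, double precision,
                               std::vector<uint_fast64_t> const& matrixRowIndices,
                               std::vector<std::pair<uint_fast64_t, double>> const& columnIndicesAndValues,
                               std::vector<double>& x,
                               std::vector<double> const& b,
                               std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
        size_t const rowCount = matrixRowIndices.size() - 1;
        size_t const groupCount = nondeterministicChoiceIndices.size() - 1;

        // SpMV over the CSR structure, then add b (the thrust::transform step above).
        std::vector<double> multiplyResult(rowCount, 0.0);
        for (size_t row = 0; row < rowCount; ++row) {
            for (uint_fast64_t k = matrixRowIndices[row]; k < matrixRowIndices[row + 1]; ++k) {
                multiplyResult[row] += columnIndicesAndValues[k].second * x[columnIndicesAndValues[k].first];
            }
            multiplyResult[row] += b[row];
        }

        // Reduce every nondeterministic row group to a single value (min or max).
        std::vector<double> newX(groupCount);
        for (size_t group = 0; group < groupCount; ++group) {
            double value = multiplyResult[nondeterministicChoiceIndices[group]];
            for (uint_fast64_t row = nondeterministicChoiceIndices[group] + 1; row < nondeterministicChoiceIndices[group + 1]; ++row) {
                value = minimize ? std::min(value, multiplyResult[row]) : std::max(value, multiplyResult[row]);
            }
            newX[group] = value;
        }

        // Convergence test matching equalModuloPrecision: largest (relative) difference < precision.
        double maxDiff = 0.0;
        for (size_t group = 0; group < groupCount; ++group) {
            double diff = std::abs(x[group] - newX[group]);
            if (relative && newX[group] != 0.0) {
                diff /= std::abs(newX[group]);
            }
            maxDiff = std::max(maxDiff, diff);
        }
        x = newX;
        return maxDiff < precision;
    }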
||||
|
|
||||
|
template <typename IndexType, typename ValueType> |
||||
|
void basicValueIteration_spmv(uint_fast64_t const matrixColCount, std::vector<IndexType> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, ValueType>> const& columnIndicesAndValues, std::vector<ValueType> const& x, std::vector<ValueType>& b) { |
||||
|
IndexType* device_matrixRowIndices = nullptr; |
||||
|
ValueType* device_matrixColIndicesAndValues = nullptr; |
||||
|
ValueType* device_x = nullptr; |
||||
|
ValueType* device_multiplyResult = nullptr; |
||||
|
|
||||
|
#ifdef DEBUG |
||||
|
std::cout.sync_with_stdio(true); |
||||
|
std::cout << "(DLL) Entering CUDA Function: basicValueIteration_spmv" << std::endl; |
||||
|
std::cout << "(DLL) Device has " << getTotalCudaMemory() << " Bytes of Memory with " << getFreeCudaMemory() << "Bytes free (" << (static_cast<double>(getFreeCudaMemory()) / static_cast<double>(getTotalCudaMemory()))*100 << "%)." << std::endl; |
||||
|
size_t memSize = sizeof(IndexType) * matrixRowIndices.size() + sizeof(IndexType) * columnIndicesAndValues.size() * 2 + sizeof(ValueType) * x.size() + sizeof(ValueType) * b.size(); |
||||
|
std::cout << "(DLL) We will allocate " << memSize << " Bytes." << std::endl; |
||||
|
#endif |
||||
|
|
||||
|
const IndexType matrixRowCount = matrixRowIndices.size() - 1; |
||||
|
const IndexType matrixNnzCount = columnIndicesAndValues.size(); |
||||
|
|
||||
|
cudaError_t cudaMallocResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixRowIndices), sizeof(IndexType) * (matrixRowCount + 1)); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(IndexType) * matrixNnzCount); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
#else |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_matrixColIndicesAndValues), sizeof(IndexType) * matrixNnzCount + sizeof(ValueType) * matrixNnzCount); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Matrix Column Indices And Values, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
#endif |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * matrixColCount); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_multiplyResult), sizeof(ValueType) * matrixRowCount); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
#ifdef DEBUG |
||||
|
std::cout << "(DLL) Finished allocating memory." << std::endl; |
||||
|
#endif |
||||
|
|
||||
|
// Memory allocated, copy data to device |
||||
|
cudaError_t cudaCopyResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * (matrixRowCount + 1), cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(IndexType) * matrixNnzCount), cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
#else |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * matrixNnzCount) + (sizeof(ValueType) * matrixNnzCount), cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
#endif |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * matrixColCount, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// Preset the multiplyResult to zeros... |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemset(device_multiplyResult, 0, sizeof(ValueType) * matrixRowCount); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not zero the multiply Result, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
#ifdef DEBUG |
||||
|
std::cout << "(DLL) Finished copying data to GPU memory." << std::endl; |
||||
|
#endif |
||||
|
|
||||
|
cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(matrixRowCount, matrixNnzCount, device_matrixRowIndices, device_matrixColIndicesAndValues, device_x, device_multiplyResult); |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
|
||||
|
#ifdef DEBUG |
||||
|
std::cout << "(DLL) Finished kernel execution." << std::endl; |
||||
|
#endif |
||||
|
|
||||
|
// Get result back from the device |
||||
|
cudaCopyResult = cudaMemcpy(b.data(), device_multiplyResult, sizeof(ValueType) * matrixRowCount, cudaMemcpyDeviceToHost); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
#ifdef DEBUG |
||||
|
std::cout << "(DLL) Finished copying result data." << std::endl; |
||||
|
#endif |
||||
|
|
||||
|
// All code related to freeing memory and clearing up the device |
||||
|
cleanup: |
||||
|
if (device_matrixRowIndices != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_matrixRowIndices); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Matrix Row Indices, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_matrixRowIndices = nullptr; |
||||
|
} |
||||
|
if (device_matrixColIndicesAndValues != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_matrixColIndicesAndValues); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Matrix Column Indices and Values, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_matrixColIndicesAndValues = nullptr; |
||||
|
} |
||||
|
if (device_x != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_x); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_x = nullptr; |
||||
|
} |
||||
|
if (device_multiplyResult != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_multiplyResult); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector multiplyResult, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_multiplyResult = nullptr; |
||||
|
} |
||||
|
#ifdef DEBUG |
||||
|
std::cout << "(DLL) Finished cleanup." << std::endl; |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
template <typename ValueType> |
||||
|
void basicValueIteration_addVectorsInplace(std::vector<ValueType>& a, std::vector<ValueType> const& b) { |
||||
|
ValueType* device_a = nullptr; |
||||
|
ValueType* device_b = nullptr; |
||||
|
|
||||
|
const size_t vectorSize = std::max(a.size(), b.size()); |
||||
|
|
||||
|
cudaError_t cudaMallocResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_a), sizeof(ValueType) * vectorSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector a, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_b), sizeof(ValueType) * vectorSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// Memory allocated, copy data to device |
||||
|
cudaError_t cudaCopyResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_a, a.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector a, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
do { |
||||
|
// Transform: Add multiplyResult + b inplace to multiplyResult |
||||
|
thrust::device_ptr<ValueType> devicePtrThrust_a(device_a); |
||||
|
thrust::device_ptr<ValueType> devicePtrThrust_b(device_b); |
||||
|
thrust::transform(devicePtrThrust_a, devicePtrThrust_a + vectorSize, devicePtrThrust_b, devicePtrThrust_a, thrust::plus<ValueType>()); |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
} while (false); |
||||
|
|
||||
|
// Get result back from the device |
||||
|
cudaCopyResult = cudaMemcpy(a.data(), device_a, sizeof(ValueType) * vectorSize, cudaMemcpyDeviceToHost); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// All code related to freeing memory and clearing up the device |
||||
|
cleanup: |
||||
|
if (device_a != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_a); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector a, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_a = nullptr; |
||||
|
} |
||||
|
if (device_b != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_b); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector b, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_b = nullptr; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
template <typename IndexType, typename ValueType, bool Minimize> |
||||
|
void basicValueIteration_reduceGroupedVector(std::vector<ValueType> const& groupedVector, std::vector<IndexType> const& grouping, std::vector<ValueType>& targetVector) { |
||||
|
ValueType* device_groupedVector = nullptr; |
||||
|
IndexType* device_grouping = nullptr; |
||||
|
ValueType* device_target = nullptr; |
||||
|
|
||||
|
const size_t groupedSize = groupedVector.size(); |
||||
|
const size_t groupingSize = grouping.size(); |
||||
|
const size_t targetSize = targetVector.size(); |
||||
|
|
||||
|
cudaError_t cudaMallocResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_groupedVector), sizeof(ValueType) * groupedSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector groupedVector, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_grouping), sizeof(IndexType) * groupingSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector grouping, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_target), sizeof(ValueType) * targetSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector targetVector, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// Memory allocated, copy data to device |
||||
|
cudaError_t cudaCopyResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_groupedVector, groupedVector.data(), sizeof(ValueType) * groupedSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector groupedVector, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_grouping, grouping.data(), sizeof(IndexType) * groupingSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector grouping, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
do { |
||||
|
// Reduce: Reduce multiplyResult to a new x vector |
||||
|
cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(groupingSize - 1, groupedSize, device_grouping, device_target, device_groupedVector); |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
} while (false); |
||||
|
|
||||
|
// Get result back from the device |
||||
|
cudaCopyResult = cudaMemcpy(targetVector.data(), device_target, sizeof(ValueType) * targetSize, cudaMemcpyDeviceToHost); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy back data for result vector, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// All code related to freeing memory and clearing up the device |
||||
|
cleanup: |
||||
|
if (device_groupedVector != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_groupedVector); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector groupedVector, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_groupedVector = nullptr; |
||||
|
} |
||||
|
if (device_grouping != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_grouping); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector grouping, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_grouping = nullptr; |
||||
|
} |
||||
|
if (device_target != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_target); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector target, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_target = nullptr; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
template <typename ValueType, bool Relative> |
||||
|
void basicValueIteration_equalModuloPrecision(std::vector<ValueType> const& x, std::vector<ValueType> const& y, ValueType& maxElement) { |
||||
|
ValueType* device_x = nullptr; |
||||
|
ValueType* device_y = nullptr; |
||||
|
|
||||
|
const size_t vectorSize = x.size(); |
||||
|
|
||||
|
cudaError_t cudaMallocResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_x), sizeof(ValueType) * vectorSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaMallocResult = cudaMalloc(reinterpret_cast<void**>(&device_y), sizeof(ValueType) * vectorSize); |
||||
|
if (cudaMallocResult != cudaSuccess) { |
||||
|
std::cout << "Could not allocate memory for Vector y, Error Code " << cudaMallocResult << "." << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
// Memory allocated, copy data to device |
||||
|
cudaError_t cudaCopyResult; |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
cudaCopyResult = cudaMemcpy(device_y, y.data(), sizeof(ValueType) * vectorSize, cudaMemcpyHostToDevice); |
||||
|
if (cudaCopyResult != cudaSuccess) { |
||||
|
std::cout << "Could not copy data for Vector y, Error Code " << cudaCopyResult << std::endl; |
||||
|
goto cleanup; |
||||
|
} |
||||
|
|
||||
|
do { |
||||
|
// Transform: x = abs(x - xSwap)/ xSwap |
||||
|
thrust::device_ptr<ValueType> devicePtrThrust_x(device_x); |
||||
|
thrust::device_ptr<ValueType> devicePtrThrust_y(device_y); |
||||
|
thrust::transform(devicePtrThrust_x, devicePtrThrust_x + vectorSize, devicePtrThrust_y, devicePtrThrust_x, equalModuloPrecision<ValueType, Relative>()); |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
|
||||
|
// Reduce: get Max over x and check for res < Precision |
||||
|
maxElement = thrust::reduce(devicePtrThrust_x, devicePtrThrust_x + vectorSize, -std::numeric_limits<ValueType>::max(), thrust::maximum<ValueType>()); |
||||
|
CUDA_CHECK_ALL_ERRORS(); |
||||
|
} while (false); |
||||
|
|
||||
|
// All code related to freeing memory and clearing up the device |
||||
|
cleanup: |
||||
|
if (device_x != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_x); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector x, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_x = nullptr; |
||||
|
} |
||||
|
if (device_y != nullptr) { |
||||
|
cudaError_t cudaFreeResult = cudaFree(device_y); |
||||
|
if (cudaFreeResult != cudaSuccess) { |
||||
|
std::cout << "Could not free Memory of Vector y, Error Code " << cudaFreeResult << "." << std::endl; |
||||
|
} |
||||
|
device_y = nullptr; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/* |
||||
|
* Declare and implement all exported functions for these Kernels here |
||||
|
* |
||||
|
*/ |
||||
|
|
||||
|
void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b) {
	basicValueIteration_spmv<uint_fast64_t, double>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
}

void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b) {
	basicValueIteration_addVectorsInplace<double>(a, b);
}

void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
	basicValueIteration_reduceGroupedVector<uint_fast64_t, double, true>(groupedVector, grouping, targetVector);
}

void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector) {
	basicValueIteration_reduceGroupedVector<uint_fast64_t, double, false>(groupedVector, grouping, targetVector);
}

void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
	basicValueIteration_equalModuloPrecision<double, true>(x, y, maxElement);
}

void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement) {
	basicValueIteration_equalModuloPrecision<double, false>(x, y, maxElement);
}

// Float
void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b) {
	basicValueIteration_spmv<uint_fast64_t, float>(matrixColCount, matrixRowIndices, columnIndicesAndValues, x, b);
}

void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b) {
	basicValueIteration_addVectorsInplace<float>(a, b);
}

void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
	basicValueIteration_reduceGroupedVector<uint_fast64_t, float, true>(groupedVector, grouping, targetVector);
}

void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector) {
	basicValueIteration_reduceGroupedVector<uint_fast64_t, float, false>(groupedVector, grouping, targetVector);
}

void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
	basicValueIteration_equalModuloPrecision<float, true>(x, y, maxElement);
}

void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement) {
	basicValueIteration_equalModuloPrecision<float, false>(x, y, maxElement);
}

bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
	if (relativePrecisionCheck) {
		return basicValueIteration_mvReduce<true, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	} else {
		return basicValueIteration_mvReduce<true, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	}
}

bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
	if (relativePrecisionCheck) {
		return basicValueIteration_mvReduce<false, true, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	} else {
		return basicValueIteration_mvReduce<false, false, uint_fast64_t, double>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	}
}

bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
	if (relativePrecisionCheck) {
		return basicValueIteration_mvReduce<true, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	} else {
		return basicValueIteration_mvReduce<true, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	}
}

bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount) {
	if (relativePrecisionCheck) {
		return basicValueIteration_mvReduce<false, true, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	} else {
		return basicValueIteration_mvReduce<false, false, uint_fast64_t, float>(maxIterationCount, precision, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
	}
}

size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
	size_t const valueTypeSize = sizeof(double);
	size_t const indexTypeSize = sizeof(uint_fast64_t);

	/*
	IndexType* device_matrixRowIndices = nullptr;
	IndexType* device_matrixColIndices = nullptr;
	ValueType* device_matrixValues = nullptr;
	ValueType* device_x = nullptr;
	ValueType* device_xSwap = nullptr;
	ValueType* device_b = nullptr;
	ValueType* device_multiplyResult = nullptr;
	IndexType* device_nondeterministicChoiceIndices = nullptr;
	*/

	// Row Indices, Column Indices, Values, Choice Indices
	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
	// Vectors x, xSwap, b, multiplyResult
	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);

	return (matrixDataSize + vectorSizes);
}

size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount) {
	size_t const valueTypeSize = sizeof(float);
	size_t const indexTypeSize = sizeof(uint_fast64_t);

	/*
	IndexType* device_matrixRowIndices = nullptr;
	IndexType* device_matrixColIndices = nullptr;
	ValueType* device_matrixValues = nullptr;
	ValueType* device_x = nullptr;
	ValueType* device_xSwap = nullptr;
	ValueType* device_b = nullptr;
	ValueType* device_multiplyResult = nullptr;
	IndexType* device_nondeterministicChoiceIndices = nullptr;
	*/

	// Row Indices, Column Indices, Values, Choice Indices
	size_t const matrixDataSize = ((rowCount + 1) * indexTypeSize) + (nnzCount * indexTypeSize) + (nnzCount * valueTypeSize) + ((rowGroupCount + 1) * indexTypeSize);
	// Vectors x, xSwap, b, multiplyResult
	size_t const vectorSizes = (rowGroupCount * valueTypeSize) + (rowGroupCount * valueTypeSize) + (rowCount * valueTypeSize) + (rowCount * valueTypeSize);

	return (matrixDataSize + vectorSizes);
}
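The two *_calculateMemorySize helpers let a caller decide up front whether a model fits on the device. As a worked example with double values, rowCount = 4, rowGroupCount = 2 and nnzCount = 8 give a matrix footprint of (5 + 8 + 3) * 8 + 8 * 8 = 192 bytes and a vector footprint of (2 + 2 + 4 + 4) * 8 = 96 bytes, 288 bytes in total. A hypothetical caller (helper name invented; cudaMemGetInfo is the standard CUDA runtime query) could compare that against the free device memory:

#include <cuda_runtime.h>
#include <cstddef>
#include <iostream>

// Sketch: decide whether the CUDA-based value iteration can be attempted for a model.
bool fitsOnDevice(size_t rowCount, size_t rowGroupCount, size_t nnzCount) {
    size_t const required = basicValueIteration_mvReduce_uint64_double_calculateMemorySize(rowCount, rowGroupCount, nnzCount);
    size_t freeBytes = 0;
    size_t totalBytes = 0;
    if (cudaMemGetInfo(&freeBytes, &totalBytes) != cudaSuccess) {
        return false; // no usable CUDA device
    }
    std::cout << "CUDA plugin needs " << required << " of " << freeBytes << " free bytes." << std::endl;
    return required <= freeBytes;
}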
@ -0,0 +1,119 @@ |
|||||
|
#ifndef STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
||||
|
#define STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
||||
|
|
||||
|
#include <cstdint> |
||||
|
#include <vector> |
||||
|
#include <utility> |
||||
|
|
||||
|
// Library exports |
||||
|
#include "cudaForStorm.h" |
||||
|
|
||||
|
/* Helper declaration to cope with new internal format */ |
||||
|
#ifndef STORM_STORAGE_SPARSEMATRIX_H_ |
||||
|
namespace storm { |
||||
|
namespace storage { |
||||
|
template<typename IndexType, typename ValueType> |
||||
|
class MatrixEntry { |
||||
|
public: |
||||
|
typedef IndexType index_type; |
||||
|
typedef ValueType value_type; |
||||
|
|
||||
|
/*! |
||||
|
* Constructs a matrix entry with the given column and value. |
||||
|
* |
||||
|
* @param column The column of the matrix entry. |
||||
|
* @param value The value of the matrix entry. |
||||
|
*/ |
||||
|
MatrixEntry(index_type column, value_type value); |
||||
|
|
||||
|
/*! |
 * Move-constructs the matrix entry from the given column-value pair. |
 * |
 * @param pair The column-value pair from which to move-construct the matrix entry. |
 */ |
MatrixEntry(std::pair<index_type, value_type>&& pair); |
||||
|
|
||||
|
MatrixEntry(); |
||||
|
MatrixEntry(MatrixEntry const& other); |
||||
|
MatrixEntry& operator=(MatrixEntry const& other); |
||||
|
#ifndef WINDOWS |
||||
|
MatrixEntry(MatrixEntry&& other); |
||||
|
MatrixEntry& operator=(MatrixEntry&& other); |
||||
|
#endif |
||||
|
|
||||
|
/*! |
||||
|
* Retrieves the column of the matrix entry. |
||||
|
* |
||||
|
* @return The column of the matrix entry. |
||||
|
*/ |
||||
|
index_type const& getColumn() const; |
||||
|
|
||||
|
/*! |
||||
|
* Sets the column of the current entry. |
||||
|
* |
||||
|
* @param column The column to set for this entry. |
||||
|
*/ |
||||
|
void setColumn(index_type const& column); |
||||
|
|
||||
|
/*! |
||||
|
* Retrieves the value of the matrix entry. |
||||
|
* |
||||
|
* @return The value of the matrix entry. |
||||
|
*/ |
||||
|
value_type const& getValue() const; |
||||
|
|
||||
|
/*! |
||||
|
* Sets the value of the entry in the matrix. |
||||
|
* |
||||
|
* @param value The value that is to be set for this entry. |
||||
|
*/ |
||||
|
void setValue(value_type const& value); |
||||
|
|
||||
|
/*! |
||||
|
* Retrieves a pair of column and value that characterizes this entry. |
||||
|
* |
||||
|
* @return A column-value pair that characterizes this entry. |
||||
|
*/ |
||||
|
std::pair<index_type, value_type> const& getColumnValuePair() const; |
||||
|
|
||||
|
/*! |
||||
|
* Multiplies the entry with the given factor and returns the result. |
||||
|
* |
||||
|
* @param factor The factor with which to multiply the entry. |
||||
|
*/ |
||||
|
MatrixEntry operator*(value_type factor) const; |
||||
|
|
||||
|
template<typename IndexTypePrime, typename ValueTypePrime> |
||||
|
friend std::ostream& operator<<(std::ostream& out, MatrixEntry<IndexTypePrime, ValueTypePrime> const& entry); |
||||
|
private: |
||||
|
// The actual matrix entry. |
||||
|
std::pair<index_type, value_type> entry; |
||||
|
}; |
||||
|
|
||||
|
} |
||||
|
} |
||||
|
#endif |
||||
|
|
||||
|
size_t basicValueIteration_mvReduce_uint64_double_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); |
||||
|
bool basicValueIteration_mvReduce_uint64_double_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
||||
|
bool basicValueIteration_mvReduce_uint64_double_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
||||
|
|
||||
|
size_t basicValueIteration_mvReduce_uint64_float_calculateMemorySize(size_t const rowCount, size_t const rowGroupCount, size_t const nnzCount); |
||||
|
bool basicValueIteration_mvReduce_uint64_float_minimize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
||||
|
bool basicValueIteration_mvReduce_uint64_float_maximize(uint_fast64_t const maxIterationCount, double const precision, bool const relativePrecisionCheck, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float>& x, std::vector<float> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices, size_t& iterationCount); |
||||
|
|
||||
|
void basicValueIteration_spmv_uint64_double(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> const& columnIndicesAndValues, std::vector<double> const& x, std::vector<double>& b); |
||||
|
void basicValueIteration_addVectorsInplace_double(std::vector<double>& a, std::vector<double> const& b); |
||||
|
void basicValueIteration_reduceGroupedVector_uint64_double_minimize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector); |
||||
|
void basicValueIteration_reduceGroupedVector_uint64_double_maximize(std::vector<double> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<double>& targetVector); |
||||
|
void basicValueIteration_equalModuloPrecision_double_Relative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement); |
||||
|
void basicValueIteration_equalModuloPrecision_double_NonRelative(std::vector<double> const& x, std::vector<double> const& y, double& maxElement); |
||||
|
|
||||
|
void basicValueIteration_spmv_uint64_float(uint_fast64_t const matrixColCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<storm::storage::MatrixEntry<uint_fast64_t, float>> const& columnIndicesAndValues, std::vector<float> const& x, std::vector<float>& b); |
||||
|
void basicValueIteration_addVectorsInplace_float(std::vector<float>& a, std::vector<float> const& b); |
||||
|
void basicValueIteration_reduceGroupedVector_uint64_float_minimize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector); |
||||
|
void basicValueIteration_reduceGroupedVector_uint64_float_maximize(std::vector<float> const& groupedVector, std::vector<uint_fast64_t> const& grouping, std::vector<float>& targetVector); |
||||
|
void basicValueIteration_equalModuloPrecision_float_Relative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement); |
||||
|
void basicValueIteration_equalModuloPrecision_float_NonRelative(std::vector<float> const& x, std::vector<float> const& y, float& maxElement); |
||||
|
|
||||
|
#endif // STORM_CUDAFORSTORM_BASICVALUEITERATION_H_ |
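To make the calling convention of the exported functions concrete, the following hypothetical toy setup (data and function name invented for illustration) populates the structures in the format implied above: matrixRowIndices holds rowCount + 1 CSR offsets, columnIndicesAndValues holds one MatrixEntry per non-zero, x has one entry per row group, b one entry per row, and nondeterministicChoiceIndices holds rowGroupCount + 1 group offsets.

#include <cstdint>
#include <vector>
#include "basicValueIteration.h"

// Toy model: 2 row groups (states), 4 rows (choices), 8 non-zero entries.
void runToyMvReduce() {
    std::vector<uint_fast64_t> matrixRowIndices = {0, 2, 4, 6, 8};        // rowCount + 1 offsets
    std::vector<storm::storage::MatrixEntry<uint_fast64_t, double>> columnIndicesAndValues;
    columnIndicesAndValues.emplace_back(0, 0.5); columnIndicesAndValues.emplace_back(1, 0.5); // row 0
    columnIndicesAndValues.emplace_back(0, 0.2); columnIndicesAndValues.emplace_back(1, 0.8); // row 1
    columnIndicesAndValues.emplace_back(0, 0.6); columnIndicesAndValues.emplace_back(1, 0.4); // row 2
    columnIndicesAndValues.emplace_back(0, 0.3); columnIndicesAndValues.emplace_back(1, 0.7); // row 3
    std::vector<double> x(2, 0.0);                                        // one value per row group
    std::vector<double> b = {0.0, 0.0, 0.5, 0.25};                        // one offset value per row
    std::vector<uint_fast64_t> nondeterministicChoiceIndices = {0, 2, 4}; // rowGroupCount + 1 offsets
    size_t iterationCount = 0;

    bool converged = basicValueIteration_mvReduce_uint64_double_minimize(10000, 1e-6, false,
        matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices, iterationCount);
    (void) converged; // x now holds the (approximate) minimal values per row group
}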
@ -0,0 +1,19 @@ |
|||||
|
#ifndef STORM_CUDAFORSTORM_CUDAFORSTORM_H_
#define STORM_CUDAFORSTORM_CUDAFORSTORM_H_

/*
 * List of exported functions in this library
 */

// TopologicalValueIteration
#include "basicValueIteration.h"

// Utility Functions
#include "utility.h"

// Version Information
#include "version.h"

#endif // STORM_CUDAFORSTORM_CUDAFORSTORM_H_
@ -0,0 +1,49 @@ |
|||||
|
#pragma once

#include "cuspExtensionFloat.h"
#include "cuspExtensionDouble.h"

namespace cusp {
namespace detail {
namespace device {

template <typename ValueType>
void storm_cuda_opt_spmv_csr_vector(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const ValueType * matrixColumnIndicesAndValues, const ValueType* x, ValueType* y) {
	//
	throw;
}
template <>
void storm_cuda_opt_spmv_csr_vector<double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) {
	storm_cuda_opt_spmv_csr_vector_double(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}
template <>
void storm_cuda_opt_spmv_csr_vector<float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) {
	storm_cuda_opt_spmv_csr_vector_float(num_rows, num_entries, matrixRowIndices, matrixColumnIndicesAndValues, x, y);
}

template <bool Minimize, typename ValueType>
void storm_cuda_opt_vector_reduce(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, ValueType * x, const ValueType * y) {
	//
	throw;
}
template <>
void storm_cuda_opt_vector_reduce<true, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
	storm_cuda_opt_vector_reduce_double<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}
template <>
void storm_cuda_opt_vector_reduce<false, double>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) {
	storm_cuda_opt_vector_reduce_double<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}

template <>
void storm_cuda_opt_vector_reduce<true, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
	storm_cuda_opt_vector_reduce_float<true>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}
template <>
void storm_cuda_opt_vector_reduce<false, float>(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) {
	storm_cuda_opt_vector_reduce_float<false>(num_rows, num_entries, nondeterministicChoiceIndices, x, y);
}

} // end namespace device
} // end namespace detail
} // end namespace cusp
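The dispatch header above lets templated solver code pick the type-specific kernel at compile time. A hedged sketch of how a caller might drive one value-iteration step through it (function name invented; all pointers are device pointers already allocated and filled; the authoritative loop, including adding the offset vector b and the convergence check, lives in basicValueIteration.cu):

// One step of value iteration expressed through the dispatch functions above.
template <bool Minimize, typename ValueType>
void valueIterationStep(uint_fast64_t rowCount, uint_fast64_t rowGroupCount, uint_fast64_t nnzCount,
                        uint_fast64_t const* rowIndices, ValueType const* columnIndicesAndValues,
                        uint_fast64_t const* choiceIndices,
                        ValueType const* x, ValueType* multiplyResult, ValueType* xSwap) {
    // multiplyResult = A * x (one entry per nondeterministic choice)
    cusp::detail::device::storm_cuda_opt_spmv_csr_vector<ValueType>(rowCount, nnzCount, rowIndices, columnIndicesAndValues, x, multiplyResult);
    // ... add the offset vector b to multiplyResult here ...
    // xSwap[group] = min/max of multiplyResult over the choices of that group
    cusp::detail::device::storm_cuda_opt_vector_reduce<Minimize, ValueType>(rowGroupCount, rowCount, choiceIndices, xSwap, multiplyResult);
}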
@ -0,0 +1,361 @@ |
|||||
|
/* |
 * This is an extension of the original CUSP csr_vector.h SpMV implementation. |
 * It is based on that code and was modified to fit the needs of the StoRM code base. |
 * The changes concern 1) a different input format, 2) the sum calculation and 3) the group-reduce algorithm. |
 */ |
||||
|
|
||||
|
/* |
||||
|
* Copyright 2008-2009 NVIDIA Corporation |
||||
|
* |
||||
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
* you may not use this file except in compliance with the License. |
||||
|
* You may obtain a copy of the License at |
||||
|
* |
||||
|
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
* |
||||
|
* Unless required by applicable law or agreed to in writing, software |
||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
* See the License for the specific language governing permissions and |
||||
|
* limitations under the License. |
||||
|
*/ |
||||
|
|
||||
|
|
||||
|
#pragma once |
||||
|
|
||||
|
#include <limits> |
||||
|
#include <cstdint> |
||||
|
#include <algorithm> |
||||
|
|
||||
|
#include <math_functions.h> |
||||
|
|
||||
|
#include <cusp/detail/device/spmv/csr_vector.h> |
||||
|
|
||||
|
namespace cusp |
||||
|
{ |
||||
|
namespace detail |
||||
|
{ |
||||
|
namespace device |
||||
|
{ |
||||
|
|
||||
|
////////////////////////////////////////////////////////////////////////////// |
||||
|
// CSR SpMV kernels based on a vector model (one warp per row) |
||||
|
////////////////////////////////////////////////////////////////////////////// |
||||
|
// |
||||
|
// spmv_csr_vector_device |
||||
|
// Each row of the CSR matrix is assigned to a warp. The warp computes |
||||
|
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
||||
|
// the x vector, in parallel. This division of work implies that |
||||
|
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
||||
|
// manner (but generally not aligned). On GT200 these accesses are |
||||
|
// coalesced, unlike kernels based on the one-row-per-thread division of |
||||
|
// work. Since an entire 32-thread warp is assigned to each row, many |
||||
|
// threads will remain idle when their row contains a small number |
||||
|
// of elements. This code relies on implicit synchronization among |
||||
|
// threads in a warp. |
||||
|
// |
||||
|
// spmv_csr_vector_tex_device |
//   Same as spmv_csr_vector_device, except that the texture cache is |
//   used for accessing the x vector. |
||||
|
// |
||||
|
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
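As a plain-C++ reference for what the warp-per-row kernel below computes (an editorial sketch, not part of this file): non-zero j of the packed double array stores its column index bit-cast into slot 2*j and its value in slot 2*j + 1, and y[i] is the dot product of row i with x. The sketch assumes uint_fast64_t is 8 bytes, which is what the plugin's alignment checks verify, and uses memcpy as the host-safe equivalent of the kernel's reinterpret_cast.

#include <cstdint>
#include <cstring>

// Sequential reference of the packed CSR SpMV computed by the kernel below.
void referenceSpmvPackedDouble(uint_fast64_t num_rows, uint_fast64_t const* rowIndices,
                               double const* columnIndicesAndValues, double const* x, double* y) {
    for (uint_fast64_t row = 0; row < num_rows; ++row) {
        double sum = 0.0;
        for (uint_fast64_t jj = rowIndices[row]; jj < rowIndices[row + 1]; ++jj) {
            uint_fast64_t column;
            std::memcpy(&column, &columnIndicesAndValues[2 * jj], sizeof(column)); // slot 2*jj: bit-cast column index
            sum += columnIndicesAndValues[2 * jj + 1] * x[column];                 // slot 2*jj + 1: matrix value
        }
        y[row] = sum;
    }
}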
||||
|
|
||||
|
|
||||
|
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
||||
|
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
||||
|
__global__ void |
||||
|
storm_cuda_opt_spmv_csr_vector_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ matrixRowIndices, const double * __restrict__ matrixColumnIndicesAndValues, const double * __restrict__ x, double * __restrict__ y) |
||||
|
{ |
||||
|
__shared__ volatile double sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
||||
|
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; |
||||
|
|
||||
|
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
||||
|
|
||||
|
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
||||
|
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
||||
|
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
||||
|
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
||||
|
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
||||
|
|
||||
|
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
||||
|
{ |
||||
|
// use two threads to fetch Ap[row] and Ap[row+1] |
||||
|
// this is considerably faster than the straightforward version |
||||
|
if(thread_lane < 2) |
||||
|
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
||||
|
|
||||
|
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
||||
|
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
||||
|
|
||||
|
// initialize local sum |
||||
|
double sum = 0; |
||||
|
|
||||
|
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
||||
|
{ |
||||
|
// ensure aligned memory access to Aj and Ax |
||||
|
|
||||
|
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
||||
|
|
||||
|
// accumulate local sums |
||||
|
if(jj >= row_start && jj < row_end) { |
||||
|
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
} |
||||
|
|
||||
|
// accumulate local sums |
||||
|
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
||||
|
} |
||||
|
} else { |
||||
|
// accumulate local sums |
||||
|
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
sum += matrixColumnIndicesAndValues[2 * jj + 1] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 2 * jj), x); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// store local sum in shared memory |
||||
|
sdata[threadIdx.x] = sum; |
||||
|
|
||||
|
// reduce local sums to row sum |
||||
|
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
||||
|
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
||||
|
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
||||
|
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
||||
|
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
||||
|
|
||||
|
// first thread writes the result |
||||
|
if (thread_lane == 0) |
||||
|
y[row] = sdata[threadIdx.x]; |
||||
|
} |
||||
|
} |
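The in-warp reduction above goes through shared memory and relies on implicit warp-synchronous execution. Purely for comparison, and only under the assumption of CUDA 9 or newer, the same per-row sum could be expressed with warp shuffles; this is not what the plugin does, just an illustrative alternative:

// Alternative in-warp sum using warp shuffles (CUDA 9+), shown for comparison only.
template <unsigned int THREADS_PER_VECTOR>
__device__ double warpReduceSum(double sum) {
    // Each step halves the number of participating lanes, mirroring the sdata[] reduction above.
    for (unsigned int offset = THREADS_PER_VECTOR / 2; offset > 0; offset /= 2) {
        sum += __shfl_down_sync(0xffffffff, sum, offset, THREADS_PER_VECTOR);
    }
    return sum; // lane 0 of each vector holds the row sum
}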
||||
|
|
||||
|
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize> |
||||
|
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) |
||||
|
__global__ void |
||||
|
storm_cuda_opt_vector_reduce_kernel_double(const uint_fast64_t num_rows, const uint_fast64_t * __restrict__ nondeterministicChoiceIndices, double * __restrict__ x, const double * __restrict__ y, const double minMaxInitializer) |
||||
|
{ |
||||
|
__shared__ volatile double sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals |
||||
|
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; |
||||
|
|
||||
|
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; |
||||
|
|
||||
|
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
||||
|
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector |
||||
|
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index |
||||
|
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block |
||||
|
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors |
||||
|
|
||||
|
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
||||
|
{ |
||||
|
// use two threads to fetch Ap[row] and Ap[row+1] |
||||
|
// this is considerably faster than the straightforward version |
||||
|
if(thread_lane < 2) |
||||
|
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; |
||||
|
|
||||
|
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
||||
|
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
||||
|
|
||||
|
// initialize local Min/Max |
||||
|
double localMinMaxElement = minMaxInitializer; |
||||
|
|
||||
|
if (THREADS_PER_ROW == 32 && row_end - row_start > 32) |
||||
|
{ |
||||
|
// ensure aligned memory access to Aj and Ax |
||||
|
|
||||
|
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; |
||||
|
|
||||
|
// accumulate the local minimum/maximum |
||||
|
if(jj >= row_start && jj < row_end) { |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// accumulate the local minimum/maximum |
||||
|
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
// accumulate the local minimum/maximum |
||||
|
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// store the local minimum/maximum in shared memory |
||||
|
sdata[threadIdx.x] = localMinMaxElement; |
||||
|
|
||||
|
// reduce local min/max to row min/max |
||||
|
if (Minimize) { |
||||
|
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
||||
|
|
||||
|
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); |
||||
|
} else { |
||||
|
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
||||
|
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); |
||||
|
} |
||||
|
|
||||
|
// first thread writes the result |
||||
|
if (thread_lane == 0) |
||||
|
x[row] = sdata[threadIdx.x]; |
||||
|
} |
||||
|
} |
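For reference, the grouped reduction above computes, for every row group, the minimum or maximum of y over that group's choices. A sequential sketch of the same computation (editorial illustration, not part of this file; function name invented):

#include <algorithm>
#include <cstdint>
#include <limits>

// Sequential reference of the grouped min/max reduction computed by the kernel above.
template <bool Minimize>
void referenceVectorReduceDouble(uint_fast64_t num_groups, uint_fast64_t const* nondeterministicChoiceIndices,
                                 double* x, double const* y) {
    for (uint_fast64_t group = 0; group < num_groups; ++group) {
        double best = Minimize ? std::numeric_limits<double>::max() : -std::numeric_limits<double>::max();
        for (uint_fast64_t jj = nondeterministicChoiceIndices[group]; jj < nondeterministicChoiceIndices[group + 1]; ++jj) {
            best = Minimize ? std::min(best, y[jj]) : std::max(best, y[jj]);
        }
        x[group] = best; // one reduced value per row group
    }
}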
||||
|
|
||||
|
template <bool Minimize, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) |
||||
|
{ |
||||
|
double __minMaxInitializer = -std::numeric_limits<double>::max(); |
||||
|
if (Minimize) { |
||||
|
__minMaxInitializer = std::numeric_limits<double>::max(); |
||||
|
} |
||||
|
const double minMaxInitializer = __minMaxInitializer; |
||||
|
|
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
storm_cuda_opt_vector_reduce_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); |
||||
|
} |
||||
|
|
||||
|
template <bool Minimize> |
||||
|
void storm_cuda_opt_vector_reduce_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, double * x, const double * y) |
||||
|
{ |
||||
|
const uint_fast64_t rows_per_group = num_entries / num_rows; |
||||
|
|
||||
|
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_double<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_double<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_double<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_double<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_vector_reduce_double<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y); |
||||
|
} |
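The dispatcher above sizes the per-group thread count from the average group size, rows_per_group = num_entries / num_rows. For instance, 100 row groups with 250 choices in total give rows_per_group = 2, so the 2-threads-per-row instantiation is launched; only groups larger than 16 choices on average use a full warp. A small standalone restatement of the same selection rule (hypothetical helper, not part of this file):

// Mirrors the THREADS_PER_ROW selection used by storm_cuda_opt_vector_reduce_double above.
inline unsigned int threadsPerRowFor(uint_fast64_t num_rows, uint_fast64_t num_entries) {
    uint_fast64_t const rows_per_group = num_entries / num_rows;
    if (rows_per_group <= 2) return 2;
    if (rows_per_group <= 4) return 4;
    if (rows_per_group <= 8) return 8;
    if (rows_per_group <= 16) return 16;
    return 32;
}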
||||
|
|
||||
|
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
||||
|
{ |
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
if (UseCache) |
||||
|
bind_x(x); |
||||
|
|
||||
|
storm_cuda_opt_spmv_csr_vector_kernel_double<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
|
||||
|
if (UseCache) |
||||
|
unbind_x(x); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_opt_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const double * matrixColumnIndicesAndValues, const double* x, double* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
} |
||||
|
|
||||
|
// NON-OPT |
||||
|
|
||||
|
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
||||
|
{ |
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
if (UseCache) |
||||
|
bind_x(x); |
||||
|
|
||||
|
spmv_csr_vector_kernel<uint_fast64_t, double, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
|
||||
|
if (UseCache) |
||||
|
unbind_x(x); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_spmv_csr_vector_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_spmv_csr_vector_double<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_spmv_csr_vector_tex_double(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const double * matrixValues, const double* x, double* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_double<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_double<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_double<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_double<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_spmv_csr_vector_double<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
} |
||||
|
|
||||
|
} // end namespace device |
||||
|
} // end namespace detail |
||||
|
} // end namespace cusp |
@ -0,0 +1,375 @@ |
|||||
|
/* |
 * This is an extension of the original CUSP csr_vector.h SpMV implementation. |
 * It is based on that code and was modified to fit the needs of the StoRM code base. |
 * As this is mostly copy & paste, the original license still applies. |
 */ |
||||
|
|
||||
|
/* |
||||
|
* Copyright 2008-2009 NVIDIA Corporation |
||||
|
* |
||||
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
* you may not use this file except in compliance with the License. |
||||
|
* You may obtain a copy of the License at |
||||
|
* |
||||
|
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
* |
||||
|
* Unless required by applicable law or agreed to in writing, software |
||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
* See the License for the specific language governing permissions and |
||||
|
* limitations under the License. |
||||
|
*/ |
||||
|
|
||||
|
|
||||
|
#pragma once |
||||
|
|
||||
|
#include <limits> |
||||
|
#include <cstdint> |
||||
|
#include <algorithm> |
||||
|
|
||||
|
#include <math_functions.h> |
||||
|
|
||||
|
#include <cusp/detail/device/spmv/csr_vector.h> |
||||
|
|
||||
|
#include "storm-cudaplugin-config.h" |
||||
|
|
||||
|
namespace cusp |
||||
|
{ |
||||
|
namespace detail |
||||
|
{ |
||||
|
namespace device |
||||
|
{ |
||||
|
|
||||
|
////////////////////////////////////////////////////////////////////////////// |
||||
|
// CSR SpMV kernels based on a vector model (one warp per row) |
||||
|
////////////////////////////////////////////////////////////////////////////// |
||||
|
// |
||||
|
// spmv_csr_vector_device |
||||
|
// Each row of the CSR matrix is assigned to a warp. The warp computes |
||||
|
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with |
||||
|
// the x vector, in parallel. This division of work implies that |
||||
|
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous |
||||
|
// manner (but generally not aligned). On GT200 these accesses are |
||||
|
// coalesced, unlike kernels based on the one-row-per-thread division of |
||||
|
// work. Since an entire 32-thread warp is assigned to each row, many |
||||
|
// threads will remain idle when their row contains a small number |
||||
|
// of elements. This code relies on implicit synchronization among |
||||
|
// threads in a warp. |
||||
|
// |
||||
|
// spmv_csr_vector_tex_device |
//   Same as spmv_csr_vector_device, except that the texture cache is |
//   used for accessing the x vector. |
||||
|
// |
||||
|
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] |
||||
|
|
||||
|
|
||||
|
template <unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR, bool UseCache> |
||||
|
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1) |
||||
|
__global__ void |
||||
|
storm_cuda_opt_spmv_csr_vector_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float * x, float * y) |
||||
|
{ |
||||
|
__shared__ volatile float sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals |
||||
|
__shared__ volatile uint_fast64_t ptrs[VECTORS_PER_BLOCK][2]; |
||||
|
|
||||
|
const uint_fast64_t THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; |
||||
|
|
||||
|
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
||||
|
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector |
||||
|
const uint_fast64_t vector_id = thread_id / THREADS_PER_VECTOR; // global vector index |
||||
|
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_VECTOR; // vector index within the block |
||||
|
const uint_fast64_t num_vectors = VECTORS_PER_BLOCK * gridDim.x; // total number of active vectors |
||||
|
|
||||
|
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
||||
|
{ |
||||
|
// use two threads to fetch Ap[row] and Ap[row+1] |
||||
|
// this is considerably faster than the straightforward version |
||||
|
if(thread_lane < 2) |
||||
|
ptrs[vector_lane][thread_lane] = matrixRowIndices[row + thread_lane]; |
||||
|
|
||||
|
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
||||
|
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
||||
|
|
||||
|
// initialize local sum |
||||
|
float sum = 0; |
||||
|
|
||||
|
if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) |
||||
|
{ |
||||
|
// ensure aligned memory access to Aj and Ax |
||||
|
|
||||
|
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane; |
||||
|
|
||||
|
// accumulate local sums |
||||
|
if(jj >= row_start && jj < row_end) { |
||||
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
||||
|
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
||||
|
#else |
||||
|
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
||||
|
#endif |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
} |
||||
|
|
||||
|
// accumulate local sums |
||||
|
for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR) { |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
||||
|
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
||||
|
#else |
||||
|
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
||||
|
#endif |
||||
|
} |
||||
|
} else { |
||||
|
// accumulate local sums |
||||
|
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR) { |
||||
|
//sum += reinterpret_cast<ValueType const*>(matrixColumnIndicesAndValues)[2*jj + 1] * fetch_x<UseCache>(matrixColumnIndicesAndValues[2*jj], x); |
||||
|
#ifdef STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT |
||||
|
sum += matrixColumnIndicesAndValues[4 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 4 * jj), x); |
||||
|
#else |
||||
|
sum += matrixColumnIndicesAndValues[3 * jj + 2] * fetch_x<UseCache>(*reinterpret_cast<uint_fast64_t const*>(matrixColumnIndicesAndValues + 3 * jj), x); |
||||
|
#endif |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// store local sum in shared memory |
||||
|
sdata[threadIdx.x] = sum; |
||||
|
|
||||
|
// reduce local sums to row sum |
||||
|
if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16]; |
||||
|
if (THREADS_PER_VECTOR > 8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8]; |
||||
|
if (THREADS_PER_VECTOR > 4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4]; |
||||
|
if (THREADS_PER_VECTOR > 2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2]; |
||||
|
if (THREADS_PER_VECTOR > 1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1]; |
||||
|
|
||||
|
// first thread writes the result |
||||
|
if (thread_lane == 0) |
||||
|
y[row] = sdata[threadIdx.x]; |
||||
|
} |
||||
|
} |
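The float kernel above reads the packed column/value array with a stride of 4 floats when STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT is defined (16 bytes per non-zero: 8-byte column index, 4-byte value, 4 bytes padding) and a stride of 3 floats otherwise (12 bytes: 8-byte index, 4-byte value). The structs below merely illustrate those two layouts; the plugin itself indexes raw float arrays.

#include <cstdint>

// Per-non-zero layout with 64-bit alignment padding: 4 float slots (16 bytes).
struct PackedEntryAligned {
    uint64_t column;  // occupies float slots 0 and 1
    float value;      // float slot 2 (read as matrixColumnIndicesAndValues[4 * jj + 2])
    float padding;    // float slot 3
};

// Per-non-zero layout without padding: 3 float slots (12 bytes).
#pragma pack(push, 1)
struct PackedEntryUnaligned {
    uint64_t column;  // float slots 0 and 1
    float value;      // float slot 2 (read as matrixColumnIndicesAndValues[3 * jj + 2])
};
#pragma pack(pop)

static_assert(sizeof(PackedEntryAligned) == 16, "aligned layout is 4 floats per entry");
static_assert(sizeof(PackedEntryUnaligned) == 12, "unpadded layout is 3 floats per entry");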
||||
|
|
||||
|
template <unsigned int ROWS_PER_BLOCK, unsigned int THREADS_PER_ROW, bool Minimize> |
||||
|
__launch_bounds__(ROWS_PER_BLOCK * THREADS_PER_ROW,1) |
||||
|
__global__ void |
||||
|
storm_cuda_opt_vector_reduce_kernel_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y, const float minMaxInitializer) |
||||
|
{ |
||||
|
__shared__ volatile float sdata[ROWS_PER_BLOCK * THREADS_PER_ROW + THREADS_PER_ROW / 2]; // padded to avoid reduction conditionals |
||||
|
__shared__ volatile uint_fast64_t ptrs[ROWS_PER_BLOCK][2]; |
||||
|
|
||||
|
const uint_fast64_t THREADS_PER_BLOCK = ROWS_PER_BLOCK * THREADS_PER_ROW; |
||||
|
|
||||
|
const uint_fast64_t thread_id = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index |
||||
|
const uint_fast64_t thread_lane = threadIdx.x & (THREADS_PER_ROW - 1); // thread index within the vector |
||||
|
const uint_fast64_t vector_id = thread_id / THREADS_PER_ROW; // global vector index |
||||
|
const uint_fast64_t vector_lane = threadIdx.x / THREADS_PER_ROW; // vector index within the block |
||||
|
const uint_fast64_t num_vectors = ROWS_PER_BLOCK * gridDim.x; // total number of active vectors |
||||
|
|
||||
|
for(uint_fast64_t row = vector_id; row < num_rows; row += num_vectors) |
||||
|
{ |
||||
|
// use two threads to fetch Ap[row] and Ap[row+1] |
||||
|
// this is considerably faster than the straightforward version |
||||
|
if(thread_lane < 2) |
||||
|
ptrs[vector_lane][thread_lane] = nondeterministicChoiceIndices[row + thread_lane]; |
||||
|
|
||||
|
const uint_fast64_t row_start = ptrs[vector_lane][0]; //same as: row_start = Ap[row]; |
||||
|
const uint_fast64_t row_end = ptrs[vector_lane][1]; //same as: row_end = Ap[row+1]; |
||||
|
|
||||
|
// initialize local Min/Max |
||||
|
float localMinMaxElement = minMaxInitializer; |
||||
|
|
||||
|
if (THREADS_PER_ROW == 32 && row_end - row_start > 32) |
||||
|
{ |
||||
|
// ensure aligned memory access to Aj and Ax |
||||
|
|
||||
|
uint_fast64_t jj = row_start - (row_start & (THREADS_PER_ROW - 1)) + thread_lane; |
||||
|
|
||||
|
// accumulate the local minimum/maximum |
||||
|
if(jj >= row_start && jj < row_end) { |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// accumulate the local minimum/maximum |
||||
|
for(jj += THREADS_PER_ROW; jj < row_end; jj += THREADS_PER_ROW) |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
// accumulate the local minimum/maximum |
||||
|
for(uint_fast64_t jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_ROW) |
||||
|
if(Minimize) { |
||||
|
localMinMaxElement = min(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement > y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} else { |
||||
|
localMinMaxElement = max(localMinMaxElement, y[jj]); |
||||
|
//localMinMaxElement = (localMinMaxElement < y[jj]) ? y[jj] : localMinMaxElement; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// store the local minimum/maximum in shared memory |
||||
|
sdata[threadIdx.x] = localMinMaxElement; |
||||
|
|
||||
|
// reduce local min/max to row min/max |
||||
|
if (Minimize) { |
||||
|
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement > sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
||||
|
|
||||
|
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 16]); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 8]); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 4]); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 2]); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = min(localMinMaxElement, sdata[threadIdx.x + 1]); |
||||
|
} else { |
||||
|
/*if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 16]) ? sdata[threadIdx.x + 16] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 8]) ? sdata[threadIdx.x + 8] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 4]) ? sdata[threadIdx.x + 4] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 2]) ? sdata[threadIdx.x + 2] : localMinMaxElement); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = ((localMinMaxElement < sdata[threadIdx.x + 1]) ? sdata[threadIdx.x + 1] : localMinMaxElement);*/ |
||||
|
if (THREADS_PER_ROW > 16) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 16]); |
||||
|
if (THREADS_PER_ROW > 8) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 8]); |
||||
|
if (THREADS_PER_ROW > 4) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 4]); |
||||
|
if (THREADS_PER_ROW > 2) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 2]); |
||||
|
if (THREADS_PER_ROW > 1) sdata[threadIdx.x] = localMinMaxElement = max(localMinMaxElement, sdata[threadIdx.x + 1]); |
||||
|
} |
||||
|
|
||||
|
// first thread writes the result |
||||
|
if (thread_lane == 0) |
||||
|
x[row] = sdata[threadIdx.x]; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
template <bool Minimize, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) |
||||
|
{ |
||||
|
float __minMaxInitializer = -std::numeric_limits<float>::max(); |
||||
|
if (Minimize) { |
||||
|
__minMaxInitializer = std::numeric_limits<float>::max(); |
||||
|
} |
||||
|
const float minMaxInitializer = __minMaxInitializer; |
||||
|
|
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
storm_cuda_opt_vector_reduce_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, Minimize> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, nondeterministicChoiceIndices, x, y, minMaxInitializer); |
||||
|
} |
||||
|
|
||||
|
template <bool Minimize> |
||||
|
void storm_cuda_opt_vector_reduce_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * nondeterministicChoiceIndices, float * x, const float * y) |
||||
|
{ |
||||
|
const uint_fast64_t rows_per_group = num_entries / num_rows; |
||||
|
|
||||
|
if (rows_per_group <= 2) { __storm_cuda_opt_vector_reduce_float<Minimize, 2>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 4) { __storm_cuda_opt_vector_reduce_float<Minimize, 4>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 8) { __storm_cuda_opt_vector_reduce_float<Minimize, 8>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
if (rows_per_group <= 16) { __storm_cuda_opt_vector_reduce_float<Minimize,16>(num_rows, nondeterministicChoiceIndices, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_vector_reduce_float<Minimize,32>(num_rows, nondeterministicChoiceIndices, x, y); |
||||
|
} |
||||
|
|
||||
|
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
||||
|
{ |
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
if (UseCache) |
||||
|
bind_x(x); |
||||
|
|
||||
|
storm_cuda_opt_spmv_csr_vector_kernel_float<VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
|
||||
|
if (UseCache) |
||||
|
unbind_x(x); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_opt_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_opt_spmv_csr_vector_tex(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const float * matrixColumnIndicesAndValues, const float* x, float* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_opt_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_opt_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_opt_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_opt_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_opt_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndicesAndValues, x, y); |
||||
|
} |
||||
|
|
||||
|
// NON-OPT |
||||
|
|
||||
|
template <bool UseCache, unsigned int THREADS_PER_VECTOR> |
||||
|
void __storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
||||
|
{ |
||||
|
const size_t THREADS_PER_BLOCK = 128; |
||||
|
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; |
||||
|
|
||||
|
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0); |
||||
|
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK)); |
||||
|
|
||||
|
if (UseCache) |
||||
|
bind_x(x); |
||||
|
|
||||
|
spmv_csr_vector_kernel<uint_fast64_t, float, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>> |
||||
|
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
|
||||
|
if (UseCache) |
||||
|
unbind_x(x); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_spmv_csr_vector_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_spmv_csr_vector_float<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
} |
||||
|
|
||||
|
void storm_cuda_spmv_csr_vector_tex_float(const uint_fast64_t num_rows, const uint_fast64_t num_entries, const uint_fast64_t * matrixRowIndices, const uint_fast64_t * matrixColumnIndices, const float * matrixValues, const float* x, float* y) |
||||
|
{ |
||||
|
const uint_fast64_t nnz_per_row = num_entries / num_rows; |
||||
|
|
||||
|
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector_float<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector_float<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector_float<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector_float<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; } |
||||
|
|
||||
|
__storm_cuda_spmv_csr_vector_float<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); |
||||
|
} |
||||
|
|
||||
|
} // end namespace device |
||||
|
} // end namespace detail |
||||
|
} // end namespace cusp |
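Editor's note: as a sanity check on the launch-size arithmetic used by the host wrappers above, the following host-only sketch reproduces the dispatch on the average number of entries per row and the resulting block count. The helper divide_into mirrors CUSP's DIVIDE_INTO macro, and maxActiveBlocks is a placeholder for the device-dependent cusp::detail::device::arch::max_active_blocks query; both, as well as the concrete matrix dimensions, are assumptions for illustration only.

// Host-only sketch (no CUDA required) of the launch-configuration arithmetic.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static size_t divide_into(size_t n, size_t granularity) {
    return (n + granularity - 1) / granularity;  // ceiling division, as in CUSP's DIVIDE_INTO
}

static unsigned int pickThreadsPerVector(uint_fast64_t num_rows, uint_fast64_t num_entries) {
    uint_fast64_t const nnz_per_row = num_entries / num_rows;  // same heuristic as the dispatchers above
    if (nnz_per_row <= 2)  return 2;
    if (nnz_per_row <= 4)  return 4;
    if (nnz_per_row <= 8)  return 8;
    if (nnz_per_row <= 16) return 16;
    return 32;
}

int main() {
    uint_fast64_t const num_rows = 100000;     // invented example size
    uint_fast64_t const num_entries = 450000;  // roughly 4.5 entries per row
    size_t const THREADS_PER_BLOCK = 128;
    size_t const maxActiveBlocks = 1024;       // placeholder for the occupancy query

    unsigned int const threadsPerVector = pickThreadsPerVector(num_rows, num_entries);
    size_t const vectorsPerBlock = THREADS_PER_BLOCK / threadsPerVector;
    size_t const numBlocks = std::min<size_t>(maxActiveBlocks, divide_into(num_rows, vectorsPerBlock));

    std::printf("THREADS_PER_VECTOR = %u, VECTORS_PER_BLOCK = %zu, NUM_BLOCKS = %zu\n",
                threadsPerVector, vectorsPerBlock, numBlocks);
    return 0;
}

For the example sizes above this selects 4 threads per vector, 32 vectors per block, and caps the grid at the (assumed) occupancy limit of 1024 blocks.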
@ -0,0 +1,39 @@
#include <iostream>
#include <chrono>

__global__ void cuda_kernel_kernelSwitchTest(int const * const A, int * const B) {
    *B = *A;
}

void kernelSwitchTest(size_t N) {
    int* deviceIntA;
    int* deviceIntB;

    // Allocate space on the device
    if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) {
        std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
        return;
    }
    if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) {
        std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
        return;
    }

    // Launch the trivial kernel N times and measure the elapsed host time
    // (no explicit device synchronization is performed).
    auto start_time = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < N; ++i) {
        cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB);
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    std::cout << "Switching the kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << " microseconds" << std::endl;
    std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << " microseconds per kernel switch" << std::endl;

    // Free memory on the device
    if (cudaFree(deviceIntA) != cudaSuccess) {
        std::cout << "Error in cudaFree!" << std::endl;
        return;
    }
    if (cudaFree(deviceIntB) != cudaSuccess) {
        std::cout << "Error in cudaFree!" << std::endl;
        return;
    }
}
@ -0,0 +1 @@
void kernelSwitchTest(size_t N);
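Editor's note: a minimal stand-alone driver for this benchmark could look as follows. It assumes only the declaration from kernelSwitchTest.h; the loop count of 10000 is an arbitrary choice for illustration.

// Hypothetical driver for the kernel-switch benchmark (link against kernelSwitchTest.cu).
#include <cstddef>

#include "kernelSwitchTest.h"

int main() {
    kernelSwitchTest(10000);  // prints the total and per-launch overhead in microseconds
    return 0;
}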
@ -0,0 +1,33 @@
#include "utility.h"

#include <cuda_runtime.h>

size_t getFreeCudaMemory() {
    size_t freeMemory;
    size_t totalMemory;
    cudaMemGetInfo(&freeMemory, &totalMemory);

    return freeMemory;
}

size_t getTotalCudaMemory() {
    size_t freeMemory;
    size_t totalMemory;
    cudaMemGetInfo(&freeMemory, &totalMemory);

    return totalMemory;
}

bool resetCudaDevice() {
    cudaError_t result = cudaDeviceReset();
    return (result == cudaSuccess);
}

int getRuntimeCudaVersion() {
    int result = -1;
    cudaError_t errorResult = cudaRuntimeGetVersion(&result);
    if (errorResult != cudaSuccess) {
        return -1;
    }
    return result;
}
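Editor's note: a small, purely illustrative consumer of these utilities is sketched below. The "at least half the memory free" threshold is an invented example policy, not something prescribed by the plugin.

// Illustrative check using the declarations from utility.h.
#include <iostream>

#include "utility.h"

int main() {
    std::cout << "CUDA runtime version: " << getRuntimeCudaVersion() << std::endl;
    std::cout << "Free device memory: " << getFreeCudaMemory() << " of "
              << getTotalCudaMemory() << " bytes" << std::endl;

    // A caller might refuse to offload work if, say, less than half the device memory is free.
    bool const enoughMemory = getFreeCudaMemory() * 2 >= getTotalCudaMemory();
    std::cout << (enoughMemory ? "Enough" : "Not enough") << " free memory for offloading." << std::endl;

    return resetCudaDevice() ? 0 : 1;
}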
@ -0,0 +1,12 @@
#ifndef STORM_CUDAFORSTORM_UTILITY_H_
#define STORM_CUDAFORSTORM_UTILITY_H_

// Library exports
#include "cudaForStorm.h"

size_t getFreeCudaMemory();
size_t getTotalCudaMemory();
bool resetCudaDevice();
int getRuntimeCudaVersion();

#endif // STORM_CUDAFORSTORM_UTILITY_H_
@ -0,0 +1,28 @@
#include "version.h"

#include "storm-cudaplugin-config.h"

size_t getStormCudaPluginVersionMajor() {
    return STORM_CUDAPLUGIN_VERSION_MAJOR;
}

size_t getStormCudaPluginVersionMinor() {
    return STORM_CUDAPLUGIN_VERSION_MINOR;
}

size_t getStormCudaPluginVersionPatch() {
    return STORM_CUDAPLUGIN_VERSION_PATCH;
}

size_t getStormCudaPluginVersionCommitsAhead() {
    return STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD;
}

const char* getStormCudaPluginVersionHash() {
    static const std::string versionHash = STORM_CUDAPLUGIN_VERSION_HASH;
    return versionHash.c_str();
}

bool getStormCudaPluginVersionIsDirty() {
    return ((STORM_CUDAPLUGIN_VERSION_DIRTY) != 0);
}
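Editor's note: a hypothetical consumer of this version API, e.g. for a version banner, might look like the sketch below; it only uses the functions declared in version.h.

// Illustrative version banner built from the API above.
#include <iostream>

#include "version.h"

int main() {
    std::cout << "StoRM CUDA plugin "
              << getStormCudaPluginVersionMajor() << "."
              << getStormCudaPluginVersionMinor() << "."
              << getStormCudaPluginVersionPatch()
              << " (" << getStormCudaPluginVersionCommitsAhead() << " commits ahead of tag, git "
              << getStormCudaPluginVersionHash()
              << (getStormCudaPluginVersionIsDirty() ? ", dirty" : "") << ")" << std::endl;
    return 0;
}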
@ -0,0 +1,16 @@
#ifndef STORM_CUDAFORSTORM_VERSION_H_
#define STORM_CUDAFORSTORM_VERSION_H_

// Library exports
#include "cudaForStorm.h"

#include <string>

size_t getStormCudaPluginVersionMajor();
size_t getStormCudaPluginVersionMinor();
size_t getStormCudaPluginVersionPatch();
size_t getStormCudaPluginVersionCommitsAhead();
const char* getStormCudaPluginVersionHash();
bool getStormCudaPluginVersionIsDirty();

#endif // STORM_CUDAFORSTORM_VERSION_H_
@ -0,0 +1,21 @@
/*
 * StoRM - Built-in Options
 *
 * This file is parsed by CMake during makefile generation.
 */

#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_

// Version information
#define STORM_CUDAPLUGIN_VERSION_MAJOR @STORM_CUDAPLUGIN_VERSION_MAJOR@ // The major version of StoRM
#define STORM_CUDAPLUGIN_VERSION_MINOR @STORM_CUDAPLUGIN_VERSION_MINOR@ // The minor version of StoRM
#define STORM_CUDAPLUGIN_VERSION_PATCH @STORM_CUDAPLUGIN_VERSION_PATCH@ // The patch version of StoRM
#define STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD @STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD@ // Number of commits since the tag was last set
#define STORM_CUDAPLUGIN_VERSION_HASH "@STORM_CUDAPLUGIN_VERSION_HASH@" // The short hash of the git commit this build is based on
#define STORM_CUDAPLUGIN_VERSION_DIRTY @STORM_CUDAPLUGIN_VERSION_DIRTY@ // 0 if no files were modified in the checkout, 1 otherwise

// Whether the size of float in a pair<uint_fast64_t, float> is expanded to 64 bit
#@STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF@ STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT

#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
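Editor's note: for illustration, after CMake substitutes the @...@ placeholders (via configure_file), the generated storm-cudaplugin-config.h could look roughly like the sketch below. All concrete values are invented; the last line becomes either a #define or an #undef depending on what STORM_CUDAPLUGIN_FLOAT_64BIT_ALIGN_DEF expands to.

// Hypothetical generated header; version numbers, hash, and flags are made up.
#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_

// Version information
#define STORM_CUDAPLUGIN_VERSION_MAJOR 1
#define STORM_CUDAPLUGIN_VERSION_MINOR 0
#define STORM_CUDAPLUGIN_VERSION_PATCH 2
#define STORM_CUDAPLUGIN_VERSION_COMMITS_AHEAD 14
#define STORM_CUDAPLUGIN_VERSION_HASH "abc1234"
#define STORM_CUDAPLUGIN_VERSION_DIRTY 0

// The placeholder expanded to "define" here; on a platform without the
// expanded float alignment it would expand to "undef" instead.
#define STORM_CUDAPLUGIN_HAVE_64BIT_FLOAT_ALIGNMENT

#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_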