Browse Source

Compiles with CUSP :)

Former-commit-id: 78555303bf
tempestpy_adaptions
PBerger 11 years ago
parent
commit
71e077f420
  1. 55
      resources/cmake/FindCusp.cmake
  2. 9
      resources/cmake/FindThrust.cmake
  3. 64
      resources/cudaForStorm/CMakeAlignmentCheck.cpp
  4. 91
      resources/cudaForStorm/CMakeLists.txt
  5. 83
      resources/cudaForStorm/srcCuda/basicValueIteration.cu
  6. 2
      resources/cudaForStorm/srcCuda/basicValueIteration.h
  7. 83
      resources/cudaForStorm/srcCuda/cuspExtension.h
  8. 13
      resources/cudaForStorm/storm-cudaplugin-config.h.in

55
resources/cmake/FindCusp.cmake

@ -0,0 +1,55 @@
#
# FindCusp
#
# This module finds the CUSP header files and extracts their version. It
# sets the following variables.
#
# CUSP_INCLUDE_DIR - Include directory for cusp header files. (All header
# files will actually be in the cusp subdirectory.)
# CUSP_VERSION - Version of cusp in the form "major.minor.patch".
#
# CUSP_FOUND - Indicates whether Cusp has been found
#
find_path(CUSP_INCLUDE_DIR
HINTS
/usr/include/cusp
/usr/local/include
/usr/local/cusp/include
${CUSP_INCLUDE_DIRS}
${CUSP_HINT}
NAMES cusp/version.h
DOC "Cusp headers"
)
if(CUSP_INCLUDE_DIR)
list(REMOVE_DUPLICATES CUSP_INCLUDE_DIR)
endif(CUSP_INCLUDE_DIR)
# Find cusp version
file(STRINGS ${CUSP_INCLUDE_DIR}/cusp/version.h
version
REGEX "#define CUSP_VERSION[ \t]+([0-9x]+)"
)
string(REGEX REPLACE
"#define CUSP_VERSION[ \t]+"
""
version
"${version}"
)
#define CUSP_MAJOR_VERSION (CUSP_VERSION / 100000)
#define CUSP_MINOR_VERSION (CUSP_VERSION / 100 % 1000)
#define CUSP_SUBMINOR_VERSION (CUSP_VERSION % 100)
math(EXPR CUSP_MAJOR_VERSION "${version} / 100000")
math(EXPR CUSP_MINOR_VERSION "${version} / 100 % 1000")
math(EXPR CUSP_PATCH_VERSION "${version} % 100")
set(CUSP_VERSION "${CUSP_MAJOR_VERSION}.${CUSP_MINOR_VERSION}.${CUSP_PATCH_VERSION}")
# Check for required components
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Cusp REQUIRED_VARS CUSP_INCLUDE_DIR VERSION_VAR CUSP_VERSION)
set(CUSP_INCLUDE_DIRS ${CUSP_INCLUDE_DIR})
mark_as_advanced(CUSP_INCLUDE_DIR)

9
resources/cmake/FindThrust.cmake

@ -1,13 +1,15 @@
#
# FindThrust
#
# This module finds the Thrust header files and extrats their version. It
# This module finds the Thrust header files and extracts their version. It
# sets the following variables.
#
# THRUST_INCLUDE_DIR - Include directory for thrust header files. (All header
# files will actually be in the thrust subdirectory.)
# THRUST_VERSION - Version of thrust in the form "major.minor.patch".
#
# Thrust_FOUND - Indicates whether Thrust has been found
#
find_path(THRUST_INCLUDE_DIR
HINTS
@ -44,10 +46,7 @@ set( THRUST_MINOR_VERSION "${minor}")
# Check for required components
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args( Thrust
REQUIRED_VARS THRUST_INCLUDE_DIR
VERSION_VAR THRUST_VERSION
)
find_package_handle_standard_args(Thrust REQUIRED_VARS THRUST_INCLUDE_DIR VERSION_VAR THRUST_VERSION)
set(THRUST_INCLUDE_DIRS ${THRUST_INCLUDE_DIR})
mark_as_advanced(THRUST_INCLUDE_DIR)

64
resources/cudaForStorm/CMakeAlignmentCheck.cpp

@ -0,0 +1,64 @@
/*
* This is component of StoRM - Cuda Plugin to check whether type alignment matches the assumptions done while optimizing the code.
*/
#include <cstdint>
#include <utility>
#include <vector>
#define CONTAINER_SIZE 100ul
template <typename IndexType, typename ValueType>
int checkForAlignmentOfPairTypes(size_t containerSize, IndexType const firstValue, ValueType const secondValue) {
std::vector<std::pair<IndexType, ValueType>>* myVector = new std::vector<std::pair<IndexType, ValueType>>();
for (size_t i = 0; i < containerSize; ++i) {
myVector->push_back(std::make_pair(firstValue, secondValue));
}
size_t myVectorSize = myVector->size();
IndexType* firstStart = &(myVector->at(0).first);
IndexType* firstEnd = &(myVector->at(myVectorSize - 1).first);
ValueType* secondStart = &(myVector->at(0).second);
ValueType* secondEnd = &(myVector->at(myVectorSize - 1).second);
size_t startOffset = reinterpret_cast<size_t>(secondStart) - reinterpret_cast<size_t>(firstStart);
size_t endOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(firstEnd);
size_t firstOffset = reinterpret_cast<size_t>(firstEnd) - reinterpret_cast<size_t>(firstStart);
size_t secondOffset = reinterpret_cast<size_t>(secondEnd) - reinterpret_cast<size_t>(secondStart);
delete myVector;
myVector = nullptr;
if (myVectorSize != containerSize) {
return -2;
}
// Check for alignment:
// Requirement is that the pairs are aligned like: first, second, first, second, first, second, ...
if (sizeof(IndexType) != sizeof(ValueType)) {
return -3;
}
if (startOffset != sizeof(IndexType)) {
return -4;
}
if (endOffset != sizeof(IndexType)) {
return -5;
}
if (firstOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
return -6;
}
if (secondOffset != ((sizeof(IndexType) + sizeof(ValueType)) * (myVectorSize - 1))) {
return -7;
}
return 0;
}
int main(int argc, char* argv[]) {
int result = 0;
result = checkForAlignmentOfPairTypes<uint_fast64_t, double>(CONTAINER_SIZE, 42, 3.14);
if (result != 0) {
return result;
}
return 0;
}

91
resources/cudaForStorm/CMakeLists.txt

@ -35,9 +35,12 @@ set(STORM_LIB_INSTALL_DIR "${PROJECT_SOURCE_DIR}/../../build/cudaForStorm" CACHE
# Add the resources/cmake folder to Module Search Path for FindTBB.cmake
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../cmake/")
# Set the hint for CUSP
set(CUSP_HINT "${PROJECT_SOURCE_DIR}/../3rdparty/cusplibrary")
find_package(CUDA REQUIRED)
find_package(Cusp REQUIRED)
find_package(Doxygen REQUIRED)
find_package(Threads REQUIRED)
find_package(Thrust REQUIRED)
# If the DEBUG option was turned on, we will target a debug version and a release version otherwise
@ -115,10 +118,23 @@ endif()
##
#############################################################
# Test for type alignment
try_run(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT
${PROJECT_BINARY_DIR} "${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp"
COMPILE_OUTPUT_VARIABLE OUTPUT_TEST_VAR
)
if(NOT STORM_CUDA_COMPILE_RESULT_TYPEALIGNMENT)
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not test type alignment, there was an Error while compiling the file ${PROJECT_SOURCE_DIR}/CMakeAlignmentCheck.cpp: ${OUTPUT_TEST_VAR}")
elseif(STORM_CUDA_RUN_RESULT_TYPEALIGNMENT EQUAL 0)
message(STATUS "StoRM (CudaPlugin) - Result of Type Alignment Check: OK.")
else()
message(FATAL_ERROR "StoRM (CudaPlugin) - Result of Type Alignment Check: FAILED (Code ${STORM_CUDA_RUN_RESULT_TYPEALIGNMENT})")
endif()
# Configure a header file to pass some of the CMake settings to the source code
configure_file (
"${PROJECT_SOURCE_DIR}/../../storm-config.h.in"
"${PROJECT_BINARY_DIR}/include/storm-config.h"
"${PROJECT_SOURCE_DIR}/storm-cudaplugin-config.h.in"
"${PROJECT_BINARY_DIR}/include/storm-cudaplugin-config.h"
)
# Add the binary dir include directory for storm-config.h
include_directories("${PROJECT_BINARY_DIR}/include")
@ -161,22 +177,6 @@ endif(ADDITIONAL_LINK_DIRS)
#############################################################
###############################################################################
## #
## Executable Creation #
## #
## All link_directories() calls MUST be made before this point #
## #
###############################################################################
include (GenerateExportHeader)
#add_library(cudaForStorm SHARED ${CUDAFORSTORM_HEADERS} ${CUDAFORSTORM_SOURCES})
#GENERATE_EXPORT_HEADER( cudaForStorm
# BASE_NAME cudaForStorm
# EXPORT_MACRO_NAME cudaForStorm_EXPORT
# EXPORT_FILE_NAME include/cudaForStorm_Export.h
# STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
#)
#############################################################
##
@ -184,18 +184,6 @@ include (GenerateExportHeader)
##
#############################################################
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30)
cuda_add_library(cudaForStorm SHARED
${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}
OPTIONS -DSTUFF="" -arch=sm_30
RELEASE -DNDEBUG
DEBUG -g -DDEBUG
)
GENERATE_EXPORT_HEADER( cudaForStorm
BASE_NAME cudaForStorm
EXPORT_MACRO_NAME cudaForStorm_EXPORT
EXPORT_FILE_NAME include/cudaForStorm_Export.h
STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
)
#target_link_libraries(cudaLibrary ${CUDA_cusparse_LIBRARY})
#ADD_DEPENDENCIES(cudaForStorm cudaLibrary)
#target_link_libraries(cudaForStorm cudaLibrary)
@ -204,19 +192,52 @@ include_directories(${CUDA_INCLUDE_DIRS})
#############################################################
##
## Threads
## CUSP
##
#############################################################
include_directories(${THREADS_INCLUDE_DIRS})
target_link_libraries(cudaForStorm ${CMAKE_THREAD_LIBS_INIT})
if(CUSP_FOUND)
include_directories(${CUSP_INCLUDE_DIR})
cuda_include_directories(${CUSP_INCLUDE_DIR})
message(STATUS "StoRM (CudaPlugin) - Found CUSP Version ${CUSP_VERSION} in location ${CUSP_INCLUDE_DIR}")
else()
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find CUSP!")
endif()
#############################################################
##
## Thrust
##
#############################################################
if(THRUST_FOUND)
include_directories(${THRUST_INCLUDE_DIR})
message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION}")
cuda_include_directories(${THRUST_INCLUDE_DIR})
message(STATUS "StoRM (CudaPlugin) - Found Thrust Version ${THRUST_VERSION} in location ${THRUST_INCLUDE_DIR}")
else()
message(FATAL_ERROR "StoRM (CudaPlugin) - Could not find Thrust! Check your CUDA installation.")
endif()
###############################################################################
## #
## Executable Creation #
## #
## All link_directories() calls AND include_directories() calls #
## MUST be made before this point #
## #
###############################################################################
include (GenerateExportHeader)
cuda_add_library(cudaForStorm SHARED
${CUDAFORSTORM_CUDA_SOURCES} ${CUDAFORSTORM_CUDA_HEADERS}
OPTIONS -DSTUFF="" -arch=sm_30
RELEASE -DNDEBUG
DEBUG -g -DDEBUG
)
GENERATE_EXPORT_HEADER( cudaForStorm
BASE_NAME cudaForStorm
EXPORT_MACRO_NAME cudaForStorm_EXPORT
EXPORT_FILE_NAME include/cudaForStorm_Export.h
STATIC_DEFINE cudaForStorm_BUILT_AS_STATIC
)
if (MSVC)
# Add the DebugHelper DLL

83
resources/cudaForStorm/srcCuda/basicValueIteration.cu

@ -6,61 +6,54 @@
#include <cuda_runtime.h>
#include "cusparse_v2.h"
#include "cuspExtension.h"
__global__ void cuda_kernel_basicValueIteration_mvReduce(int const * const A, int * const B) {
*B = *A;
}
void cudaForStormTestFunction(int a, int b) {
std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl;
}
void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
if (sizeof(double) != sizeof(uint_fast64_t)) {
std::cout << "FATAL ERROR - Internal Sizes of Double and uint_fast64_t do NOT match, CUDA acceleration not possible!" << std::endl;
return;
}
uint_fast64_t* device_matrixRowIndices = nullptr;
uint_fast64_t* device_matrixColIndicesAndValues = nullptr;
double* device_x = nullptr;
double* device_b = nullptr;
double* device_multiplyResult = nullptr;
uint_fast64_t* device_nondeterministicChoiceIndices = nullptr;
template <typename IndexType, typename ValueType>
void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<IndexType> const& matrixRowIndices, std::vector<std::pair<IndexType, ValueType>> columnIndicesAndValues, std::vector<ValueType>& x, std::vector<ValueType> const& b, std::vector<IndexType> const& nondeterministicChoiceIndices) {
IndexType* device_matrixRowIndices = nullptr;
IndexType* device_matrixColIndicesAndValues = nullptr;
ValueType* device_x = nullptr;
ValueType* device_b = nullptr;
ValueType* device_multiplyResult = nullptr;
IndexType* device_nondeterministicChoiceIndices = nullptr;
cudaError_t cudaMallocResult;
cudaMallocResult = cudaMalloc<uint_fast64_t>(&device_matrixRowIndices, matrixRowIndices.size());
cudaMallocResult = cudaMalloc<IndexType>(&device_matrixRowIndices, matrixRowIndices.size());
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Row Indices, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<uint_fast64_t>(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2);
cudaMallocResult = cudaMalloc<IndexType>(&device_matrixColIndicesAndValues, columnIndicesAndValues.size() * 2);
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Matrix Column Indices and Values, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<double>(&device_x, x.size());
cudaMallocResult = cudaMalloc<ValueType>(&device_x, x.size());
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector x, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<double>(&device_b, b.size());
cudaMallocResult = cudaMalloc<ValueType>(&device_b, b.size());
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector b, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<double>(&device_multiplyResult, b.size());
cudaMallocResult = cudaMalloc<ValueType>(&device_multiplyResult, b.size());
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Vector multiplyResult, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
}
cudaMallocResult = cudaMalloc<uint_fast64_t>(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size());
cudaMallocResult = cudaMalloc<IndexType>(&device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.size());
if (cudaMallocResult != cudaSuccess) {
std::cout << "Could not allocate memory for Nondeterministic Choice Indices, Error Code " << cudaMallocResult << "." << std::endl;
goto cleanup;
@ -69,31 +62,31 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve
// Memory allocated, copy data to device
cudaError_t cudaCopyResult;
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(uint_fast64_t) * matrixRowIndices.size(), cudaMemcpyHostToDevice);
cudaCopyResult = cudaMemcpy(device_matrixRowIndices, matrixRowIndices.data(), sizeof(IndexType) * matrixRowIndices.size(), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Row Indices, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(uint_fast64_t) * columnIndicesAndValues.size()) + (sizeof(double) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice);
cudaCopyResult = cudaMemcpy(device_matrixColIndicesAndValues, columnIndicesAndValues.data(), (sizeof(IndexType) * columnIndicesAndValues.size()) + (sizeof(ValueType) * columnIndicesAndValues.size()), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Matrix Column Indices and Values, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(double) * x.size(), cudaMemcpyHostToDevice);
cudaCopyResult = cudaMemcpy(device_x, x.data(), sizeof(ValueType) * x.size(), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector x, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(double) * b.size(), cudaMemcpyHostToDevice);
cudaCopyResult = cudaMemcpy(device_b, b.data(), sizeof(ValueType) * b.size(), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
}
cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(uint_fast64_t) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice);
cudaCopyResult = cudaMemcpy(device_nondeterministicChoiceIndices, nondeterministicChoiceIndices.data(), sizeof(IndexType) * nondeterministicChoiceIndices.size(), cudaMemcpyHostToDevice);
if (cudaCopyResult != cudaSuccess) {
std::cout << "Could not copy data for Vector b, Error Code " << cudaCopyResult << std::endl;
goto cleanup;
@ -101,6 +94,7 @@ void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::ve
// Data is on device, start Kernel
// All code related to freeing memory and clearing up the device
cleanup:
if (device_matrixRowIndices != nullptr) {
@ -148,35 +142,14 @@ cleanup:
}
/*
void kernelSwitchTest(size_t N) {
int* deviceIntA;
int* deviceIntB;
* Declare and implement all exported functions for these Kernels here
*
*/
if (cudaMalloc((void**)&deviceIntA, sizeof(int)) != cudaSuccess) {
std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
return;
}
if (cudaMalloc((void**)&deviceIntB, sizeof(int)) != cudaSuccess) {
std::cout << "Error in cudaMalloc while allocating " << sizeof(int) << " Bytes!" << std::endl;
return;
}
// Allocate space on the device
auto start_time = std::chrono::high_resolution_clock::now();
for (int i = 0; i < N; ++i) {
cuda_kernel_kernelSwitchTest<<<1,1>>>(deviceIntA, deviceIntB);
void cudaForStormTestFunction(int a, int b) {
std::cout << "Cuda for Storm: a + b = " << (a+b) << std::endl;
}
auto end_time = std::chrono::high_resolution_clock::now();
std::cout << "Switching the Kernel " << N << " times took " << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() << "micros" << std::endl;
std::cout << "Resulting in " << (std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count() / ((double)(N))) << "Microseconds per Kernel Switch" << std::endl;
// Free memory on device
if (cudaFree(deviceIntA) != cudaSuccess) {
std::cout << "Error in cudaFree!" << std::endl;
return;
}
if (cudaFree(deviceIntB) != cudaSuccess) {
std::cout << "Error in cudaFree!" << std::endl;
return;
void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices) {
basicValueIteration_mvReduce<uint_fast64_t, double>(maxIterationCount, matrixRowIndices, columnIndicesAndValues, x, b, nondeterministicChoiceIndices);
}
}*/

2
resources/cudaForStorm/srcCuda/basicValueIteration.h

@ -6,4 +6,4 @@
#include "cudaForStorm_Export.h"
cudaForStorm_EXPORT void cudaForStormTestFunction(int a, int b);
cudaForStorm_EXPORT void basicValueIteration_mvReduce(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);
cudaForStorm_EXPORT void basicValueIteration_mvReduce_uint64_double(uint_fast64_t const maxIterationCount, std::vector<uint_fast64_t> const& matrixRowIndices, std::vector<std::pair<uint_fast64_t, double>> columnIndicesAndValues, std::vector<double>& x, std::vector<double> const& b, std::vector<uint_fast64_t> const& nondeterministicChoiceIndices);

83
resources/cudaForStorm/srcCuda/cuspExtension.h

@ -0,0 +1,83 @@
/*
* This is an extension of the original CUSP csr_vector.h SPMV implementation.
* It is based on the Code and incorporates changes as to cope with the details
* of the StoRM code.
* As this is mostly copy & past, the original license still applies.
*/
/*
* Copyright 2008-2009 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cusp/detail/device/spmv/csr_vector.h>
namespace cusp
{
namespace detail
{
namespace device
{
template <bool UseCache, unsigned int THREADS_PER_VECTOR, typename IndexType, typename ValueType>
void __storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
const size_t THREADS_PER_BLOCK = 128;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache>, THREADS_PER_BLOCK, (size_t) 0);
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(num_rows, VECTORS_PER_BLOCK));
if (UseCache)
bind_x(x);
spmv_csr_vector_kernel<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR, UseCache> <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>
(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
if (UseCache)
unbind_x(x);
}
template <typename IndexType, typename ValueType>
void storm_cuda_spmv_csr_vector(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector<false, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector<false, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector<false, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector<false,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
__storm_cuda_spmv_csr_vector<false,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}
template <typename IndexType, typename ValueType>
void storm_cuda_spmv_csr_vector_tex(const IndexType num_rows, const IndexType num_entries, const IndexType * matrixRowIndices, const IndexType * matrixColumnIndices, const ValueType * matrixValues, const ValueType* x, ValueType* y)
{
const IndexType nnz_per_row = num_entries / num_rows;
if (nnz_per_row <= 2) { __storm_cuda_spmv_csr_vector<true, 2>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 4) { __storm_cuda_spmv_csr_vector<true, 4>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 8) { __storm_cuda_spmv_csr_vector<true, 8>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
if (nnz_per_row <= 16) { __storm_cuda_spmv_csr_vector<true,16>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y); return; }
__storm_cuda_spmv_csr_vector<true,32>(num_rows, matrixRowIndices, matrixColumnIndices, matrixValues, x, y);
}
} // end namespace device
} // end namespace detail
} // end namespace cusp

13
resources/cudaForStorm/storm-cudaplugin-config.h.in

@ -0,0 +1,13 @@
/*
* StoRM - Build-in Options
*
* This file is parsed by CMake during makefile generation
* It contains information such as the base path to the test/example data
*/
#ifndef STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
#define STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
#endif // STORM_CUDAPLUGIN_GENERATED_STORMCONFIG_H_
Loading…
Cancel
Save