Skip to content

Commit

Permalink
review update
Browse files Browse the repository at this point in the history
  • Loading branch information
yhmtsai committed Nov 14, 2019
1 parent 6964af0 commit 1713912
Show file tree
Hide file tree
Showing 12 changed files with 100 additions and 50 deletions.
2 changes: 0 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,6 @@ if (GINKGO_BUILD_HIP)
endif()
endif()

set(GINKGO_HIP_PLATFORM ${GINKGO_HIP_PLATFORM} PARENT_SCOPE)

configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in
${Ginkgo_BINARY_DIR}/include/ginkgo/config.hpp @ONLY)

Expand Down
22 changes: 22 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,28 @@ function(ginkgo_benchmark_cusp_linops name)
endif()
endfunction()

# Configures an existing benchmark target `name` so it can use the hipSPARSE
# linear operators.
#
# Side effects on `name`:
#   * defines HAS_HIP=1
#   * appends the hipconfig-reported C++ flags to its COMPILE_FLAGS
#   * adds the HIP / hipSPARSE include directories as SYSTEM includes
#   * links hip::device (hcc platform) or ${HIP_CUDA_LIBRARIES} (nvcc
#     platform), plus ${HIPSPARSE_LIBRARIES}
function(ginkgo_benchmark_hipsp_linops name)
    target_compile_definitions("${name}" PRIVATE HAS_HIP=1)
    # hipconfig prints the compiler flags needed to consume HIP headers.
    # Strip the trailing newline, otherwise COMPILE_FLAGS receives a literal
    # line break and the generated compile command is malformed.
    execute_process(
        COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config
        OUTPUT_VARIABLE HIP_CXX_FLAGS
        OUTPUT_STRIP_TRAILING_WHITESPACE)
    # Quote the flags so a list/spaces survive as a single property value.
    set_target_properties("${name}" PROPERTIES COMPILE_FLAGS "${HIP_CXX_FLAGS}")
    find_package(HIP REQUIRED)
    find_package(hipsparse REQUIRED)
    target_include_directories("${name}" SYSTEM PRIVATE
        ${HSA_HEADER} ${HIP_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})

    if(GINKGO_HIP_PLATFORM MATCHES "hcc")
        # The `-hc` flag leaks through the hcc imported targets and is not a
        # valid option for the host compiler/linker, so scrub it before use.
        ginkgo_hip_ban_link_hcflag(hcc::hccrt)
        ginkgo_hip_ban_link_hcflag(hcc::hc_am)
        ginkgo_hip_ban_link_hcflag(hcc::mcwamp)
        ginkgo_hip_ban_compile_hcflag(hcc::hccrt)
        ginkgo_hip_ban_compile_hcflag(hcc::hc_am)
        ginkgo_hip_ban_compile_hcflag(hcc::mcwamp)
        target_link_libraries("${name}" hip::device)
    else()
        # nvcc platform: HIP_CUDA_LIBRARIES is exported to the parent scope by
        # hip/CMakeLists.txt (see the PARENT_SCOPE set there).
        target_link_libraries("${name}" ${HIP_CUDA_LIBRARIES})
    endif()
    target_link_libraries("${name}" ${HIPSPARSE_LIBRARIES})
endfunction()

add_subdirectory(conversions)
add_subdirectory(matrix_generator)
add_subdirectory(matrix_statistics)
Expand Down
3 changes: 3 additions & 0 deletions benchmark/solver/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@ add_executable(solver solver.cpp)
target_link_libraries(solver ginkgo gflags rapidjson)
if (GINKGO_BUILD_CUDA)
ginkgo_benchmark_cusp_linops(solver)
endif()
if (GINKGO_BUILD_HIP)
ginkgo_benchmark_hipsp_linops(solver)
endif()
11 changes: 1 addition & 10 deletions benchmark/spmv/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,5 @@ if (GINKGO_BUILD_CUDA)
ginkgo_benchmark_cusp_linops(spmv)
endif()
if (GINKGO_BUILD_HIP)
target_compile_definitions(spmv PRIVATE HAS_HIP=1)
EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS)
set_target_properties(spmv PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS})
target_include_directories(spmv SYSTEM PRIVATE ${HSA_HEADER} ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
if(GINKGO_HIP_PLATFORM MATCHES "hcc")
target_link_libraries(spmv "${HIP_PATH}/lib/libhip_hcc.so" )
else()
target_link_libraries(spmv ${CUDA_LIBRARIES})
endif()
target_link_libraries(spmv "${HIPSPARSE_PATH}/lib/libhipsparse.so")
ginkgo_benchmark_hipsp_linops(spmv)
endif()
12 changes: 12 additions & 0 deletions cmake/build_helpers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,15 @@ endmacro()
# Switches the MSVC runtime selection for language `lang` by delegating to
# ginkgo_switch_windows_link with the ("MT", "MD") flag pair — presumably
# replacing static-runtime /MT* flags with dynamic-runtime /MD* ones; the
# helper's definition is outside this view, so confirm its argument order.
macro(ginkgo_switch_to_windows_dynamic lang)
ginkgo_switch_windows_link(${lang} "MT" "MD")
endmacro()

# Removes the stray `-hc ` flag from `target`'s INTERFACE_LINK_LIBRARIES.
# The hcc imported targets propagate this flag when building a static
# library, and it is not a known option to any host compiler/linker.
macro(ginkgo_hip_ban_link_hcflag target)
    get_target_property(GINKGO_TARGET_ILL ${target} INTERFACE_LINK_LIBRARIES)
    # get_target_property() yields `<var>-NOTFOUND` when the property is
    # unset; only rewrite the property when there is something to scrub,
    # otherwise the literal NOTFOUND string would be written back.
    if(GINKGO_TARGET_ILL)
        string(REPLACE "-hc " "" GINKGO_TARGET_NEW_ILL "${GINKGO_TARGET_ILL}")
        set_target_properties(${target} PROPERTIES
            INTERFACE_LINK_LIBRARIES "${GINKGO_TARGET_NEW_ILL}")
    endif()
endmacro()

# Removes the stray `-hc` flag from `target`'s INTERFACE_COMPILE_OPTIONS.
# Companion of ginkgo_hip_ban_link_hcflag for the compile-side propagation
# of the same hcc flag.
macro(ginkgo_hip_ban_compile_hcflag target)
    get_target_property(GINKGO_TARGET_ICO ${target} INTERFACE_COMPILE_OPTIONS)
    # Guard against `<var>-NOTFOUND` when the property is unset (see
    # ginkgo_hip_ban_link_hcflag): never write the NOTFOUND string back.
    if(GINKGO_TARGET_ICO)
        string(REPLACE "-hc" "" GINKGO_TARGET_NEW_ICO "${GINKGO_TARGET_ICO}")
        set_target_properties(${target} PROPERTIES
            INTERFACE_COMPILE_OPTIONS "${GINKGO_TARGET_NEW_ICO}")
    endif()
endmacro()
27 changes: 15 additions & 12 deletions cuda/components/cooperative_groups.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


namespace gko {
namespace kernels {
namespace cuda {


/**
Expand All @@ -59,7 +61,7 @@ namespace gko {
* A cooperative group (both from standard CUDA and from Ginkgo) is not a
* specific type, but a concept. That is, any type satisfying the interface
* imposed by the cooperative groups API is considered a cooperative
* group (a.k.a. "duck typing"). To maximize the generality of components than
* group (a.k.a. "duck typing"). To maximize the generality of components that
* need cooperative groups, instead of creating the group manually, consider
* requesting one as an input parameter. Make sure its type is a template
* parameter to maximize the set of groups for which your algorithm can be
Expand Down Expand Up @@ -228,19 +230,18 @@ public:
__device__ unsigned thread_rank() const noexcept { return data_.rank; }

private:
// clang-format off
__device__ grid_group()
: data_{blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y *
gridDim.z,
threadIdx.x +
blockDim.x *
(threadIdx.y +
blockDim.y *
(threadIdx.z +
blockDim.z *
(blockIdx.x +
gridDim.x *
(blockIdx.y + gridDim.y * blockIdx.z))))}
: data_{
blockDim.x * blockDim.y * blockDim.z *
gridDim.x * gridDim.y * gridDim.z,
threadIdx.x + blockDim.x *
(threadIdx.y + blockDim.y *
(threadIdx.z + blockDim.z *
(blockIdx.x + gridDim.x *
(blockIdx.y + gridDim.y * blockIdx.z))))}
{}
// clang-format on

struct alignas(8) {
unsigned size;
Expand Down Expand Up @@ -459,6 +460,8 @@ __device__ __forceinline__ thread_block_tile<Size> tiled_partition(


} // namespace group
} // namespace cuda
} // namespace kernels
} // namespace gko


Expand Down
4 changes: 2 additions & 2 deletions cuda/components/format_conversion.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ namespace host_kernel {
/**
* @internal
*
* It calculates the number of warps used in Coo Spmv by GPU architecture and
* the number of stored elements.
* It calculates the number of warps used in Coo Spmv depending on the GPU
* architecture and the number of stored elements.
*/
template <size_type subwarp_size = cuda_config::warp_size>
__host__ size_type calculate_nwarps(std::shared_ptr<const CudaExecutor> exec,
Expand Down
10 changes: 10 additions & 0 deletions cuda/matrix/coo_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ namespace {
* @param c the output dense vector
* @param c_stride the stride of the output dense vector
* @param scale the function on the added value
*
* @tparam ValueType type of values stored in the matrix
* @tparam IndexType type of matrix indexes stored in the structure
* @tparam Closure type of the function used to write the result
*/
template <int subwarp_size = cuda_config::warp_size, typename ValueType,
typename IndexType, typename Closure>
Expand Down Expand Up @@ -185,6 +189,10 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
* @param c the output dense vector
* @param c_stride the stride of the output dense vector
* @param scale the function on the added value
*
* @tparam ValueType type of values stored in the matrix
* @tparam IndexType type of matrix indexes stored in the structure
* @tparam Closure type of the function used to write the result
*/
template <typename ValueType, typename IndexType, typename Closure>
__device__ void spmm_kernel(const size_type nnz, const size_type num_elems,
Expand Down Expand Up @@ -360,6 +368,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

namespace kernel {


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
const IndexType *__restrict__ idxs, size_type num_nonzeros,
Expand All @@ -381,6 +390,7 @@ __global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
}
}


} // namespace kernel


Expand Down
13 changes: 4 additions & 9 deletions hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ list(APPEND CMAKE_PREFIX_PATH
"${HIPBLAS_PATH}/lib/cmake"
"${HIPSPARSE_PATH}/lib/cmake"
)
# Set CMAKE_MODULE_PATH in PARENT_SCOPE for benchmark
set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" PARENT_SCOPE)

find_package(HIP REQUIRED)
find_package(hipblas REQUIRED)
Expand All @@ -87,11 +89,6 @@ if (NOT GINKGO_HIP_THRUST_PATH)
message(FATAL_ERROR "Could not find the ROCm header thrust/complex.h which is required by Ginkgo HIP.")
endif()


set(HIP_INCLUDE_DIRS ${HIP_INCLUDE_DIRS} PARENT_SCOPE)
set(HIPSPARSE_INCLUDE_DIRS ${HIPSPARSE_INCLUDE_DIRS} PARENT_SCOPE)
set(HIPBLAS_INCLUDE_DIRS ${HIPBLAS_INCLUDE_DIRS} PARENT_SCOPE)
set(GINKGO_HIP_THRUST_PATH ${GINKGO_HIP_THRUST_PATH} PARENT_SCOPE)
set(GINKGO_HIP_SOURCES
base/exception.hip.cpp
base/executor.hip.cpp
Expand Down Expand Up @@ -156,15 +153,13 @@ if(GINKGO_HIP_PLATFORM MATCHES "hcc")

# Ban `-hc` flag as INTERFACE_LINK_LIBRARIES since that is propagated when building
# a static library, and it's definitely not a known option to any compiler.
get_target_property(GINKGO_HCCRT_ILL hcc::hccrt INTERFACE_LINK_LIBRARIES)
string(REPLACE "-hc " "" GINKGO_HCCRT_NEW_ILL "${GINKGO_HCCRT_ILL}")
set_target_properties(hcc::hccrt PROPERTIES INTERFACE_LINK_LIBRARIES "${GINKGO_HCCRT_NEW_ILL}")
ginkgo_hip_ban_link_hcflag(hcc::hccrt)

target_link_libraries(ginkgo_hip PRIVATE hip::device)
elseif(GINKGO_HIP_PLATFORM MATCHES "nvcc")
find_package(CUDA 9.0 REQUIRED)
target_link_libraries(ginkgo_hip PRIVATE ${CUDA_LIBRARIES})
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} PARENT_SCOPE)
set(HIP_CUDA_LIBRARIES ${CUDA_LIBRARIES} PARENT_SCOPE)
endif()

target_link_libraries(ginkgo_hip PRIVATE roc::hipblas roc::hipsparse)
Expand Down
27 changes: 15 additions & 12 deletions hip/components/cooperative_groups.hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


namespace gko {
namespace kernels {
namespace hip {


/**
Expand All @@ -59,7 +61,7 @@ namespace gko {
* A cooperative group (both from standard HIP and from Ginkgo) is not a
* specific type, but a concept. That is, any type satisfying the interface
* imposed by the cooperative groups API is considered a cooperative
* group (a.k.a. "duck typing"). To maximize the generality of components than
* group (a.k.a. "duck typing"). To maximize the generality of components that
* need cooperative groups, instead of creating the group manually, consider
* requesting one as an input parameter. Make sure its type is a template
* parameter to maximize the set of groups for which your algorithm can be
Expand Down Expand Up @@ -408,19 +410,18 @@ class grid_group {
__device__ unsigned thread_rank() const noexcept { return data_.rank; }

private:
// clang-format off
__device__ grid_group()
: data_{blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y *
gridDim.z,
threadIdx.x +
blockDim.x *
(threadIdx.y +
blockDim.y *
(threadIdx.z +
blockDim.z *
(blockIdx.x +
gridDim.x *
(blockIdx.y + gridDim.y * blockIdx.z))))}
: data_{
blockDim.x * blockDim.y * blockDim.z *
gridDim.x * gridDim.y * gridDim.z,
threadIdx.x + blockDim.x *
(threadIdx.y + blockDim.y *
(threadIdx.z + blockDim.z *
(blockIdx.x + gridDim.x *
(blockIdx.y + gridDim.y * blockIdx.z))))}
{}
// clang-format on

struct alignas(8) {
unsigned size;
Expand All @@ -436,6 +437,8 @@ __device__ inline grid_group this_grid() { return {}; }


} // namespace group
} // namespace hip
} // namespace kernels
} // namespace gko


Expand Down
7 changes: 4 additions & 3 deletions hip/components/format_conversion.hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,15 @@ namespace host_kernel {
/**
* @internal
*
* It calculates the number of warps used in Coo Spmv by GPU architecture and
* the number of stored elements.
* It calculates the number of warps used in Coo Spmv depending on the GPU
* architecture and the number of stored elements.
*/
template <size_type subwarp_size = hip_config::warp_size>
__host__ size_type calculate_nwarps(std::shared_ptr<const HipExecutor> exec,
const size_type nnz)
{
// One multiprocessor has 4 SIMD
// In GCN (Graphics Core Next), each multiprocessor has 4 SIMD
// Reference: https://en.wikipedia.org/wiki/Graphics_Core_Next
size_type nwarps_in_hip = exec->get_num_multiprocessor() * 4;
size_type multiple = 8;
if (nnz >= 2000000) {
Expand Down
12 changes: 12 additions & 0 deletions hip/matrix/coo_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ namespace {
* @param c the output dense vector
* @param c_stride the stride of the output dense vector
* @param scale the function on the added value
*
* @tparam ValueType type of values stored in the matrix
* @tparam IndexType type of matrix indexes stored in the structure
* @tparam Closure type of the function used to write the result
*/
template <int subwarp_size = hip_config::warp_size, typename ValueType,
typename IndexType, typename Closure>
Expand Down Expand Up @@ -188,6 +192,10 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
* @param c the output dense vector
* @param c_stride the stride of the output dense vector
* @param scale the function on the added value
*
* @tparam ValueType type of values stored in the matrix
* @tparam IndexType type of matrix indexes stored in the structure
* @tparam Closure type of the function used to write the result
*/
template <typename ValueType, typename IndexType, typename Closure>
__device__ void spmm_kernel(const size_type nnz, const size_type num_elems,
Expand Down Expand Up @@ -296,6 +304,7 @@ void spmv2(std::shared_ptr<const HipExecutor> exec,
const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);

if (nwarps > 0) {
// TODO: b_ncols needs to be tuned.
if (b_ncols < 4) {
const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
int num_lines = ceildiv(nnz, nwarps * hip_config::warp_size);
Expand Down Expand Up @@ -336,6 +345,7 @@ void advanced_spmv2(std::shared_ptr<const HipExecutor> exec,
const auto b_ncols = b->get_size()[1];

if (nwarps > 0) {
// TODO: b_ncols needs to be tuned.
if (b_ncols < 4) {
int num_lines = ceildiv(nnz, nwarps * hip_config::warp_size);
const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
Expand Down Expand Up @@ -367,6 +377,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

namespace kernel {


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
const IndexType *__restrict__ idxs, size_type num_nonzeros,
Expand All @@ -388,6 +399,7 @@ __global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
}
}


} // namespace kernel


Expand Down

0 comments on commit 1713912

Please sign in to comment.