Skip to content

Commit

Permalink
review update
Browse files Browse the repository at this point in the history
  • Loading branch information
yhmtsai committed Nov 14, 2019
1 parent 6964af0 commit 1713912
Show file tree
Hide file tree
Showing 12 changed files with 100 additions and 50 deletions.
2 changes: 0 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,6 @@ if (GINKGO_BUILD_HIP)
endif()
endif()

set(GINKGO_HIP_PLATFORM ${GINKGO_HIP_PLATFORM} PARENT_SCOPE)

configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in
${Ginkgo_BINARY_DIR}/include/ginkgo/config.hpp @ONLY)

Expand Down
22 changes: 22 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,28 @@ function(ginkgo_benchmark_cusp_linops name)
endif()
endfunction()

# Configures an existing benchmark target `name` so it can use the hipSPARSE
# linear operators.
#
# Side effects on `name`:
#   * defines HAS_HIP=1
#   * appends the hipconfig-reported C++ flags to its COMPILE_FLAGS
#   * adds the HIP / hipSPARSE include directories as SYSTEM includes
#   * links hip::device (hcc platform) or ${HIP_CUDA_LIBRARIES} (nvcc
#     platform), plus ${HIPSPARSE_LIBRARIES}
function(ginkgo_benchmark_hipsp_linops name)
    target_compile_definitions("${name}" PRIVATE HAS_HIP=1)
    # hipconfig prints the compiler flags needed to consume HIP headers.
    # Strip the trailing newline, otherwise COMPILE_FLAGS receives a literal
    # line break and the generated compile command is malformed.
    execute_process(
        COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config
        OUTPUT_VARIABLE HIP_CXX_FLAGS
        OUTPUT_STRIP_TRAILING_WHITESPACE)
    # Quote the flags so a list/spaces survive as a single property value.
    set_target_properties("${name}" PROPERTIES COMPILE_FLAGS "${HIP_CXX_FLAGS}")
    find_package(HIP REQUIRED)
    find_package(hipsparse REQUIRED)
    target_include_directories("${name}" SYSTEM PRIVATE
        ${HSA_HEADER} ${HIP_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})

    if(GINKGO_HIP_PLATFORM MATCHES "hcc")
        # The `-hc` flag leaks through the hcc imported targets and is not a
        # valid option for the host compiler/linker, so scrub it before use.
        ginkgo_hip_ban_link_hcflag(hcc::hccrt)
        ginkgo_hip_ban_link_hcflag(hcc::hc_am)
        ginkgo_hip_ban_link_hcflag(hcc::mcwamp)
        ginkgo_hip_ban_compile_hcflag(hcc::hccrt)
        ginkgo_hip_ban_compile_hcflag(hcc::hc_am)
        ginkgo_hip_ban_compile_hcflag(hcc::mcwamp)
        target_link_libraries("${name}" hip::device)
    else()
        # nvcc platform: HIP_CUDA_LIBRARIES is exported to the parent scope by
        # hip/CMakeLists.txt (see the PARENT_SCOPE set there).
        target_link_libraries("${name}" ${HIP_CUDA_LIBRARIES})
    endif()
    target_link_libraries("${name}" ${HIPSPARSE_LIBRARIES})
endfunction()

add_subdirectory(conversions)
add_subdirectory(matrix_generator)
add_subdirectory(matrix_statistics)
Expand Down
3 changes: 3 additions & 0 deletions benchmark/solver/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@ add_executable(solver solver.cpp)
target_link_libraries(solver ginkgo gflags rapidjson)
if (GINKGO_BUILD_CUDA)
ginkgo_benchmark_cusp_linops(solver)
endif()
if (GINKGO_BUILD_HIP)
ginkgo_benchmark_hipsp_linops(solver)
endif()
11 changes: 1 addition & 10 deletions benchmark/spmv/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,5 @@ if (GINKGO_BUILD_CUDA)
ginkgo_benchmark_cusp_linops(spmv)
endif()
if (GINKGO_BUILD_HIP)
target_compile_definitions(spmv PRIVATE HAS_HIP=1)
EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS)
set_target_properties(spmv PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS})
target_include_directories(spmv SYSTEM PRIVATE ${HSA_HEADER} ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
if(GINKGO_HIP_PLATFORM MATCHES "hcc")
target_link_libraries(spmv "${HIP_PATH}/lib/libhip_hcc.so" )
else()
target_link_libraries(spmv ${CUDA_LIBRARIES})
endif()
target_link_libraries(spmv "${HIPSPARSE_PATH}/lib/libhipsparse.so")
ginkgo_benchmark_hipsp_linops(spmv)
endif()
12 changes: 12 additions & 0 deletions cmake/build_helpers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,15 @@ endmacro()
# Switches the MSVC runtime selection for language `lang` by delegating to
# ginkgo_switch_windows_link with the ("MT", "MD") flag pair — presumably
# replacing static-runtime /MT* flags with dynamic-runtime /MD* ones; the
# helper's definition is outside this view, so confirm its argument order.
macro(ginkgo_switch_to_windows_dynamic lang)
ginkgo_switch_windows_link(${lang} "MT" "MD")
endmacro()

# Removes the stray `-hc ` flag from `target`'s INTERFACE_LINK_LIBRARIES.
# The hcc imported targets propagate this flag when building a static
# library, and it is not a known option to any host compiler/linker.
macro(ginkgo_hip_ban_link_hcflag target)
    get_target_property(GINKGO_TARGET_ILL ${target} INTERFACE_LINK_LIBRARIES)
    # get_target_property() yields `<var>-NOTFOUND` when the property is
    # unset; only rewrite the property when there is something to scrub,
    # otherwise the literal NOTFOUND string would be written back.
    if(GINKGO_TARGET_ILL)
        string(REPLACE "-hc " "" GINKGO_TARGET_NEW_ILL "${GINKGO_TARGET_ILL}")
        set_target_properties(${target} PROPERTIES
            INTERFACE_LINK_LIBRARIES "${GINKGO_TARGET_NEW_ILL}")
    endif()
endmacro()

# Removes the stray `-hc` flag from `target`'s INTERFACE_COMPILE_OPTIONS.
# Companion of ginkgo_hip_ban_link_hcflag for the compile-side propagation
# of the same hcc flag.
macro(ginkgo_hip_ban_compile_hcflag target)
    get_target_property(GINKGO_TARGET_ICO ${target} INTERFACE_COMPILE_OPTIONS)
    # Guard against `<var>-NOTFOUND` when the property is unset (see
    # ginkgo_hip_ban_link_hcflag): never write the NOTFOUND string back.
    if(GINKGO_TARGET_ICO)
        string(REPLACE "-hc" "" GINKGO_TARGET_NEW_ICO "${GINKGO_TARGET_ICO}")
        set_target_properties(${target} PROPERTIES
            INTERFACE_COMPILE_OPTIONS "${GINKGO_TARGET_NEW_ICO}")
    endif()
endmacro()
27 changes: 15 additions & 12 deletions cuda/components/cooperative_groups.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


namespace gko {
namespace kernels {
namespace cuda {


/**
Expand All @@ -59,7 +61,7 @@ namespace gko {
* A cooperative group (both from standard CUDA and from Ginkgo) is not a
* specific type, but a concept. That is, any type satisfying the interface
* imposed by the cooperative groups API is considered a cooperative
* group (a.k.a. "duck typing"). To maximize the generality of components than
* group (a.k.a. "duck typing"). To maximize the generality of components that
* need cooperative groups, instead of creating the group manually, consider
* requesting one as an input parameter. Make sure its type is a template
* parameter to maximize the set of groups for which your algorithm can be
Expand Down Expand Up @@ -228,19 +230,18 @@ public:
__device__ unsigned thread_rank() const noexcept { return data_.rank; }

private:
// clang-format off
__device__ grid_group()
: data_{blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y *
gridDim.z,
threadIdx.x +
blockDim.x *
(threadIdx.y +
blockDim.y *
(threadIdx.z +
blockDim.z *
(blockIdx.x +
gridDim.x *
(blockIdx.y + gridDim.y * blockIdx.z))))}
: data_{
blockDim.x * blockDim.y * blockDim.z *
gridDim.x * gridDim.y * gridDim.z,
threadIdx.x + blockDim.x *
(threadIdx.y + blockDim.y *
(threadIdx.z + blockDim.z *
(blockIdx.x + gridDim.x *
(blockIdx.y + gridDim.y * blockIdx.z))))}
{}
// clang-format on

struct alignas(8) {
unsigned size;
Expand Down Expand Up @@ -459,6 +460,8 @@ __device__ __forceinline__ thread_block_tile<Size> tiled_partition(


} // namespace group
} // namespace cuda
} // namespace kernels
} // namespace gko


Expand Down
4 changes: 2 additions & 2 deletions cuda/components/format_conversion.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ namespace host_kernel {
/**
* @internal
*
* It calculates the number of warps used in Coo Spmv by GPU architecture and
* the number of stored elements.
* It calculates the number of warps used in Coo Spmv depending on the GPU
* architecture and the number of stored elements.
*/
template <size_type subwarp_size = cuda_config::warp_size>
__host__ size_type calculate_nwarps(std::shared_ptr<const CudaExecutor> exec,
Expand Down
10 changes: 10 additions & 0 deletions cuda/matrix/coo_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ namespace {
* @param c the output dense vector
* @param c_stride the stride of the output dense vector
* @param scale the function on the added value
*
* @tparam ValueType type of values stored in the matrix
* @tparam IndexType type of matrix indexes stored in the structure
* @tparam Closure type of the function used to write the result
*/
template <int subwarp_size = cuda_config::warp_size, typename ValueType,
typename IndexType, typename Closure>
Expand Down Expand Up @@ -185,6 +189,10 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
* @param c the output dense vector
* @param c_stride the stride of the output dense vector
* @param scale the function on the added value
*
* @tparam ValueType type of values stored in the matrix
* @tparam IndexType type of matrix indexes stored in the structure
* @tparam Closure type of the function used to write the result
*/
template <typename ValueType, typename IndexType, typename Closure>
__device__ void spmm_kernel(const size_type nnz, const size_type num_elems,
Expand Down Expand Up @@ -360,6 +368,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

namespace kernel {


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
const IndexType *__restrict__ idxs, size_type num_nonzeros,
Expand All @@ -381,6 +390,7 @@ __global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
}
}


} // namespace kernel


Expand Down
13 changes: 4 additions & 9 deletions hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ list(APPEND CMAKE_PREFIX_PATH
"${HIPBLAS_PATH}/lib/cmake"
"${HIPSPARSE_PATH}/lib/cmake"
)
# Set CMAKE_MODULE_PATH in PARENT_SCOPE for benchmark
set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" PARENT_SCOPE)

find_package(HIP REQUIRED)
find_package(hipblas REQUIRED)
Expand All @@ -87,11 +89,6 @@ if (NOT GINKGO_HIP_THRUST_PATH)
message(FATAL_ERROR "Could not find the ROCm header thrust/complex.h which is required by Ginkgo HIP.")
endif()


set(HIP_INCLUDE_DIRS ${HIP_INCLUDE_DIRS} PARENT_SCOPE)
set(HIPSPARSE_INCLUDE_DIRS ${HIPSPARSE_INCLUDE_DIRS} PARENT_SCOPE)
set(HIPBLAS_INCLUDE_DIRS ${HIPBLAS_INCLUDE_DIRS} PARENT_SCOPE)
set(GINKGO_HIP_THRUST_PATH ${GINKGO_HIP_THRUST_PATH} PARENT_SCOPE)
set(GINKGO_HIP_SOURCES
base/exception.hip.cpp
base/executor.hip.cpp
Expand Down Expand Up @@ -156,15 +153,13 @@ if(GINKGO_HIP_PLATFORM MATCHES "hcc")

# Ban `-hc` flag as INTERFACE_LINK_LIBRARIES since that is propagated when building
# a static library, and it's definitely not a known option to any compiler.
get_target_property(GINKGO_HCCRT_ILL hcc::hccrt INTERFACE_LINK_LIBRARIES)
string(REPLACE "-hc " "" GINKGO_HCCRT_NEW_ILL "${GINKGO_HCCRT_ILL}")
set_target_properties(hcc::hccrt PROPERTIES INTERFACE_LINK_LIBRARIES "${GINKGO_HCCRT_NEW_ILL}")
ginkgo_hip_ban_link_hcflag(hcc::hccrt)

target_link_libraries(ginkgo_hip PRIVATE hip::device)
elseif(GINKGO_HIP_PLATFORM MATCHES "nvcc")
find_package(CUDA 9.0 REQUIRED)
target_link_libraries(ginkgo_hip PRIVATE ${CUDA_LIBRARIES})
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} PARENT_SCOPE)
set(HIP_CUDA_LIBRARIES ${CUDA_LIBRARIES} PARENT_SCOPE)
endif()

target_link_libraries(ginkgo_hip PRIVATE roc::hipblas roc::hipsparse)
Expand Down
27 changes: 15 additions & 12 deletions hip/components/cooperative_groups.hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


namespace gko {
namespace kernels {
namespace hip {


/**
Expand All @@ -59,7 +61,7 @@ namespace gko {
* A cooperative group (both from standard HIP and from Ginkgo) is not a
* specific type, but a concept. That is, any type satisfying the interface
* imposed by the cooperative groups API is considered a cooperative
* group (a.k.a. "duck typing"). To maximize the generality of components than
* group (a.k.a. "duck typing"). To maximize the generality of components that
* need cooperative groups, instead of creating the group manually, consider
* requesting one as an input parameter. Make sure its type is a template
* parameter to maximize the set of groups for which your algorithm can be
Expand Down Expand Up @@ -408,19 +410,18 @@ class grid_group {
__device__ unsigned thread_rank() const noexcept { return data_.rank; }

private:
// clang-format off
__device__ grid_group()
: data_{blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y *
gridDim.z,
threadIdx.x +
blockDim.x *
(threadIdx.y +
blockDim.y *
(threadIdx.z +
blockDim.z *
(blockIdx.x +
gridDim.x *
(blockIdx.y + gridDim.y * blockIdx.z))))}
: data_{
blockDim.x * blockDim.y * blockDim.z *
gridDim.x * gridDim.y * gridDim.z,
threadIdx.x + blockDim.x *
(threadIdx.y + blockDim.y *
(threadIdx.z + blockDim.z *
(blockIdx.x + gridDim.x *
(blockIdx.y + gridDim.y * blockIdx.z))))}
{}
// clang-format on

struct alignas(8) {
unsigned size;
Expand All @@ -436,6 +437,8 @@ __device__ inline grid_group this_grid() { return {}; }


} // namespace group
} // namespace hip
} // namespace kernels
} // namespace gko


Expand Down
7 changes: 4 additions & 3 deletions hip/components/format_conversion.hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,15 @@ namespace host_kernel {
/**
* @internal
*
* It calculates the number of warps used in Coo Spmv by GPU architecture and
* the number of stored elements.
* It calculates the number of warps used in Coo Spmv depending on the GPU
* architecture and the number of stored elements.
*/
template <size_type subwarp_size = hip_config::warp_size>
__host__ size_type calculate_nwarps(std::shared_ptr<const HipExecutor> exec,
const size_type nnz)
{
// One multiprocessor has 4 SIMD
// In GCN (Graphics Core Next), each multiprocessor has 4 SIMD
// Reference: https://en.wikipedia.org/wiki/Graphics_Core_Next
size_type nwarps_in_hip = exec->get_num_multiprocessor() * 4;
size_type multiple = 8;
if (nnz >= 2000000) {
Expand Down
12 changes: 12 additions & 0 deletions hip/matrix/coo_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ namespace {
* @param c the output dense vector
* @param c_stride the stride of the output dense vector
* @param scale the function on the added value
*
* @tparam ValueType type of values stored in the matrix
* @tparam IndexType type of matrix indexes stored in the structure
* @tparam Closure type of the function used to write the result
*/
template <int subwarp_size = hip_config::warp_size, typename ValueType,
typename IndexType, typename Closure>
Expand Down Expand Up @@ -188,6 +192,10 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmv(
* @param c the output dense vector
* @param c_stride the stride of the output dense vector
* @param scale the function on the added value
*
* @tparam ValueType type of values stored in the matrix
* @tparam IndexType type of matrix indexes stored in the structure
* @tparam Closure type of the function used to write the result
*/
template <typename ValueType, typename IndexType, typename Closure>
__device__ void spmm_kernel(const size_type nnz, const size_type num_elems,
Expand Down Expand Up @@ -296,6 +304,7 @@ void spmv2(std::shared_ptr<const HipExecutor> exec,
const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);

if (nwarps > 0) {
// TODO: b_ncols needs to be tuned.
if (b_ncols < 4) {
const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
int num_lines = ceildiv(nnz, nwarps * hip_config::warp_size);
Expand Down Expand Up @@ -336,6 +345,7 @@ void advanced_spmv2(std::shared_ptr<const HipExecutor> exec,
const auto b_ncols = b->get_size()[1];

if (nwarps > 0) {
// TODO: b_ncols needs to be tuned.
if (b_ncols < 4) {
int num_lines = ceildiv(nnz, nwarps * hip_config::warp_size);
const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
Expand Down Expand Up @@ -367,6 +377,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

namespace kernel {


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
const IndexType *__restrict__ idxs, size_type num_nonzeros,
Expand All @@ -388,6 +399,7 @@ __global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
}
}


} // namespace kernel


Expand Down

0 comments on commit 1713912

Please sign in to comment.