diff --git a/.readthedocs.yaml b/.readthedocs.yaml index ac68ce076c..f8a03b71dc 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -24,7 +24,7 @@ build: post_checkout: - if [ -d ../clr ]; then rm -rf ../clr; fi - if [ -d ../ROCR-Runtime ]; then rm -rf ../ROCR-Runtime; fi - - git clone --depth=1 --single-branch --branch docs/develop https://github.com/ROCm/clr.git ../clr + - git clone --depth=1 --single-branch --branch cooperative_groups_documentation https://github.com/ROCm/clr.git ../clr - git clone --depth=1 --single-branch --branch master https://github.com/ROCm/ROCR-Runtime.git ../ROCR-Runtime post_build: - rm -rf ../clr diff --git a/.wordlist.txt b/.wordlist.txt index 17ece19b5c..8b77cfcb65 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -18,6 +18,7 @@ cuCtx cuDNN dataflow deallocate +decompositions denormal dll DirectX @@ -103,6 +104,7 @@ structs SYCL syntaxes tradeoffs +templated typedefs UMM WinGDB diff --git a/docs/conf.py b/docs/conf.py index 8981813248..4ac5cc3208 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -46,3 +46,10 @@ suppress_warnings = ["etoc.toctree"] numfig = False + + +exclude_patterns = [ + "doxygen/mainpage.md", + "understand/glossary.md", + "understand/thread_hierarchy_coop_figure.rst" +] \ No newline at end of file diff --git a/docs/data/understand/programming_model_reference/thread_hierarchy_coop.drawio b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop.drawio similarity index 98% rename from docs/data/understand/programming_model_reference/thread_hierarchy_coop.drawio rename to docs/data/how-to/cooperative_groups/thread_hierarchy_coop.drawio index fb4c19fef9..e4c0c90d2d 100644 --- a/docs/data/understand/programming_model_reference/thread_hierarchy_coop.drawio +++ b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop.drawio @@ -1,6 +1,6 @@ - + @@ -1411,7 +1411,7 @@ - + @@ -1591,7 +1591,7 @@ - + @@ -1762,7 +1762,7 @@ - + @@ -1876,7 +1876,7 @@ - + @@ -2047,7 +2047,7 @@ - + @@ -3490,7 +3490,7 @@ - + @@ -3670,7 +3670,7 @@ - + @@ -3841,7 +3841,7 @@ - + @@ -3955,7 +3955,7 @@ - + @@ -4126,7 +4126,7 @@ - + @@ -4534,7 +4534,7 @@ - + @@ -4600,7 +4600,7 @@ - + @@ -4771,7 +4771,7 @@ - + @@ -4933,7 +4933,7 @@ - + @@ -4984,163 +4984,163 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop.svg b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop.svg new file mode 100644 index 0000000000..199f92306a --- /dev/null +++ b/docs/data/how-to/cooperative_groups/thread_hierarchy_coop.svg @@ -0,0 +1 @@ +Grid
[remainder of the added single-line SVG omitted: it only repeats the text labels "Block", "Warp" and "Multi Grid" of the cooperative groups thread hierarchy figure, plus the fallback text "Text is not SVG - cannot display"]
\ No newline at end of file diff --git a/docs/data/understand/programming_model_reference/thread_hierarchy_coop.svg b/docs/data/understand/programming_model_reference/thread_hierarchy_coop.svg deleted file mode 100644 index a3f57994fb..0000000000 --- a/docs/data/understand/programming_model_reference/thread_hierarchy_coop.svg +++ /dev/null @@ -1 +0,0 @@ -Grid
[remainder of the deleted single-line SVG omitted: it only repeats the text labels "Block", "Warp", "Cluster" and "Multi Grid" of the previous thread hierarchy figure, plus the fallback text "Text is not SVG - cannot display"]
\ No newline at end of file diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 023fe7bfc5..aa71441160 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -833,6 +833,8 @@ INPUT = mainpage.md \ ../../include/hip \ ../../../clr/hipamd/include/hip/amd_detail/amd_hip_gl_interop.h \ ../../../clr/hipamd/include/hip/amd_detail/amd_surface_functions.h \ + ../../../clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h \ + ../../../llvm-project/clang/lib/Headers/__clang_hip_math.h \ ../../../ROCR-Runtime/src/inc/hsa_ext_amd.h # This tag can be used to specify the character encoding of the source files @@ -2196,6 +2198,8 @@ INCLUDE_FILE_PATTERNS = # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. PREDEFINED = "__HIP_PLATFORM_AMD__" \ + "DOXYGEN_SHOULD_INCLUDE_THIS=1" \ + "DOXYGEN_SHOULD_SKIP_THIS=1" \ "__dparm(x)=" \ "__cplusplus=201103L" \ "__host__=" \ @@ -2203,7 +2207,8 @@ PREDEFINED = "__HIP_PLATFORM_AMD__" \ "__hip_img_chk__=" \ "__CG_QUALIFIER__=" \ "__CG_STATIC_QUALIFIER__=static" \ - "_CG_STATIC_CONST_DECL_=static constexpr" + "_CG_STATIC_CONST_DECL_=static constexpr" \ + "HIP_ENABLE_WARP_SYNC_BUILTINS" # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/docs/how-to/cooperative_groups.rst b/docs/how-to/cooperative_groups.rst new file mode 100644 index 0000000000..14791b11fb --- /dev/null +++ b/docs/how-to/cooperative_groups.rst @@ -0,0 +1,468 @@ +.. meta:: + :description: This topic describes how to use cooperative groups in HIP + :keywords: AMD, ROCm, HIP, cooperative groups + +.. _cooperative_groups_how-to: + +******************************************************************************* +Cooperative Groups +******************************************************************************* + +Cooperative groups API is an extension to the HIP programming model, which provides developers with a flexible, dynamic grouping mechanism for the communicating threads. The API enables the developers to specify the level of granularity for thread communication which can lead to more efficient parallel decompositions. + +.. Maybe this sentence is better: The rich set of APIs introduced by Cooperative Groups allow the programmer to define their own set of thread groups which may fit their user-cases better than those defined by the hardware. + +The API is accessible in the ``cooperative_groups`` namespace after the ``hip_cooperative_groups.h`` is included. The header contains the following elements: + +* Static functions to create groups and subgroups. +* Hardware-accelerated operations over the whole group, like shuffles. +* Data types of cooperative groups. +* Synchronize member function of the groups. +* Get group properties member functions. + +.. note:: + + This feature was introduced with ROCm 4.1 at AMD platform and CUDA 9.0 at NVIDIA platform. This introduced a new level between block and thread block level for synchronization. + +Cooperative groups thread model +=============================== + +The thread hierarchy abstraction of cooperative groups is in +:numref:`coop_thread_hierarchy`. + +.. _coop_thread_hierarchy: + +.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop.svg + :alt: Diagram depicting nested rectangles of varying color. The outermost one + titled "Grid", inside sets of different sized rectangles layered on + one another titled "Block". 
Each "Block" containing sets of uniform + rectangles layered on one another titled "Warp". Each of the "Warp" + titled rectangles filled with downward pointing arrows inside. + + Cooperative group thread hierarchy. + +The **multi grid** is an abstraction of potentially multiple simultaneous launches of the same kernel over multiple devices (Deprecated since 5.0). The **grid** in cooperative groups is a single dispatch of kernels for execution like the original grid. The ability to synchronize over a grid requires the kernel to be launched using the cooperative groups API. The **block** is the same as the :ref:`inherent_thread_model` block entity. + +.. note:: + + Explicit warp-level thread handling is absent from the Cooperative Groups API. In order to exploit the known hardware SIMD width on which built-in functionality translates to simpler logic, you can use the group partitioning part of the API, such as ``tiled_partition``. + +For details on memory model, check the :ref:`memory model description `. + +Cooperative groups simple example +================================= + +The difference to the original block model in the ``reduce_sum`` device function is the following. + +.. tab-set:: + .. tab-item:: Original Block + :sync: original-block + + .. code-block:: cuda + + __device__ int reduce_sum(int *shared, int val) { + + // Thread ID + const unsigned int thread_id = threadIdx.x; + + // Every iteration the number of active threads + // halves, until we processed all values + for(unsigned int i = blockDim.x / 2; i > 0; i /= 2) { + // Store value in shared memory with thread ID + shared[thread_id] = val; + + // Synchronize all threads + __syncthreads(); + + // Active thread sum up + if(thread_id < i) + val += shared[thread_id + i]; + + // Synchronize all threads in the group + __syncthreads(); + } + + // ... + } + + .. tab-item:: Cooperative Groups + :sync: cooperative-groups + + .. code-block:: cuda + + __device__ int reduce_sum(thread_group g, + int *shared, + int val) { + + // Thread ID + const unsigned int group_thread_id = g.thread_rank(); + + // Every iteration the number of active threads + // halves, until we processed all values + for(unsigned int i = g.size() / 2; i > 0; i /= 2) { + // Store value in shared memroy with thread ID + shared[group_thread_id] = val; + + // Synchronize all threads in the group + g.sync(); + + // Active thread sum up + if(group_thread_id < i) + val += shared[group_thread_id + i]; + + // Synchronize all threads in the group + g.sync(); + } + + // ... + } + +The ``reduce_sum()`` function call and input data initialization difference to the original block model is the following. + +.. tab-set:: + .. tab-item:: Original Block + :sync: original-block + + .. code-block:: cuda + + __global__ void sum_kernel(...) { + + // ... + + // Workspace array in shared memory + __shared__ unsigned int workspace[2048]; + + // ... + + // Perform reduction + output = reduce_sum(workspace, input); + + // ... + } + + .. tab-item:: Cooperative Groups + :sync: cooperative-groups + + .. code-block:: cuda + + __global__ void sum_kernel(...) { + + // ... + + // Workspace array in shared memory + __shared__ unsigned int workspace[2048]; + + // ... + + // Initialize the thread_block + thread_block thread_block_group = this_thread_block(); + // Perform reduction + output = reduce_sum(thread_block_group, workspace, input); + + // ... + } + +At the device function, the input group type is the ``thread_group``, which is the parent class of all the cooperative groups type. 
+With this, you can write generic functions that work with any type of cooperative group.
+
+Group types
+===========
+
+Group types are based on the levels of synchronization and data sharing among threads.
+
+Thread-block group
+------------------
+
+Represents an intra-block cooperative groups type, where the participating threads within the group are the same threads that participate in the currently executing ``block``.
+
+.. code-block:: cpp
+
+    class thread_block;
+
+Constructed via:
+
+.. code-block:: cpp
+
+    thread_block g = this_thread_block();
+
+The public member functions of the ``thread_block`` class are ``group_index()``, ``thread_index()``, ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()``, ``sync()`` and ``group_dim()``. For further details, check the :ref:`thread_block references <thread_block_ref>`.
+
+Grid group
+----------
+
+Represents an inter-block cooperative groups type, where the group's participating threads span multiple blocks running the same kernel on the same device. Use the cooperative launch API to enable synchronization across the grid group.
+
+.. code-block:: cpp
+
+    class grid_group;
+
+Constructed via:
+
+.. code-block:: cpp
+
+    grid_group g = this_grid();
+
+The public member functions of the ``grid_group`` class are ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()`` and ``sync()``. For further details, check the :ref:`grid_group references <grid_group_ref>`.
+
+Multi-grid group
+----------------
+
+Represents an inter-device cooperative groups type, where the participating threads within the group span multiple devices that run the same kernel. All multi-grid group APIs require the kernel to be launched with the appropriate cooperative multi-device launch API.
+
+.. code-block:: cpp
+
+    class multi_grid_group;
+
+Constructed via:
+
+.. code-block:: cpp
+
+    // Kernel must be launched with the cooperative multi-device API
+    multi_grid_group g = this_multi_grid();
+
+The public member functions of the ``multi_grid_group`` class are ``num_grids()``, ``grid_rank()``, ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()`` and ``sync()``. For further details, check the :ref:`multi_grid_group references <multi_grid_group_ref>`.
+
+Thread-block tile
+-----------------
+
+This constructs a templated class derived from ``thread_group``. The template defines the tile size of the new thread group at compile time. This group type also supports sub-wave level intrinsics.
+
+.. code-block:: cpp
+
+    template <unsigned int Size, typename ParentT = void>
+    class thread_block_tile;
+
+Constructed via:
+
+.. code-block:: cpp
+
+    template <unsigned int Size, typename ParentT>
+    _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
+
+.. note::
+
+    * ``Size`` must be a power of 2 and not larger than the warp (wavefront) size.
+    * ``shfl`` functions support integer or float types.
+
+The public member functions of the ``thread_block_tile`` class are ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()``, ``sync()``, ``meta_group_rank()``, ``meta_group_size()``, ``shfl(...)``, ``shfl_down(...)``, ``shfl_up(...)`` and ``shfl_xor(...)``. For further details, check the :ref:`thread_block_tile references <thread_block_tile_ref>`.
+
+Coalesced groups
+----------------
+
+Threads in a warp (64 threads on CDNA, 32 threads on RDNA or NVIDIA GPUs) cannot execute different instructions simultaneously, so conditional branches are executed serially within the warp. When threads encounter a conditional branch, they can diverge, and the threads that do not meet the condition to execute that branch are temporarily disabled.
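+
+The following minimal sketch (a hypothetical kernel, for illustration only) shows such a divergent branch. Within a single warp, the two sides of the ``if`` statement are executed one after the other, and in each of them only the threads that took that side are active:
+
+.. code-block:: cuda
+
+    __global__ void divergent_branch_kernel(const int* d_input, int* d_output)
+    {
+        const unsigned int tid = threadIdx.x;
+
+        if(d_input[tid] % 2 == 0)
+        {
+            // Only the threads of the warp whose input value is even are active here
+            d_output[tid] = 2 * d_input[tid];
+        }
+        else
+        {
+            // The remaining threads of the warp are active here
+            d_output[tid] = d_input[tid] + 1;
+        }
+    }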
The active threads referred as coalesced and coalesced groups represents an active thread group within a warp. + +This group type also supports sub-wave level intrinsics. + +.. code-block:: cpp + + class coalesced_group; + +Constructed via: + +.. code-block:: cpp + + coalesced_group active = coalesced_threads(); + +.. note:: + + * ``shfl`` functions support integer or float type. + +The ``thread_rank()`` , ``size()``, ``cg_type()``, ``is_valid()``, ``sync()``, ``meta_group_rank()``, ``meta_group_size()``, ``shfl(...)``, ``shfl_down(...)``, and ``shfl_up(...)`` member functions are public of the ``coalesced_group`` class. For more information, see :ref:`coalesced_group references ` . + +.. _coop_synchronization: + +Synchronization +=============== + +With each group type, the synchronization requires using the correct cooperative groups launch API. + +**Check the kernel launch capability** + +.. tab-set:: + .. tab-item:: Thread-block + :sync: thread-block + + Do not need kernel launch validation. + + .. tab-item:: Grid + :sync: grid + + Confirm the cooperative launch capability on the single AMD GPU: + + .. code-block:: cpp + + int device = 0; + int supports_coop_launch = 0; + // Check support + // Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK( + hipDeviceGetAttribute(&supports_coop_launch, hipDeviceAttributeCooperativeLaunch, device)); + if(!supports_coop_launch) + { + std::cout << "Skipping, device " << device << " does not support cooperative groups" + << std::endl; + return 0; + } + + .. tab-item:: Multi-grid + :sync: multi-grid + + Confirm the cooperative launch capability over multiple GPUs: + + .. code-block:: cpp + + // Check support of cooperative groups + std::vector deviceIDs; + for(int deviceID = 0; deviceID < device_count; deviceID++) { + #ifdef __HIP_PLATFORM_AMD__ + int supports_coop_launch = 0; + HIP_CHECK( + hipDeviceGetAttribute( + &supports_coop_launch, + hipDeviceAttributeCooperativeMultiDeviceLaunch, + deviceID)); + if(!supports_coop_launch) { + std::cout << "Skipping, device " << deviceID << " does not support cooperative groups" + << std::endl; + } + else + #endif + { + std::cout << deviceID << std::endl; + // Collect valid deviceIDs. + deviceIDs.push_back(deviceID); + } + } + +**Kernel launch** + +.. tab-set:: + .. tab-item:: Thread-block + :sync: thread-block + + You can access the new block representation using the original kernel launch methods. + + .. code-block:: cpp + + void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced}; + // Launching kernel from host. + HIP_CHECK(hipLaunchKernelGGL(vector_reduce_kernel, + dim3(num_blocks), + dim3(threads_per_block), + 0, + hipStreamDefault, + &d_vector, + &d_block_reduced, + &d_partition_reduced)); + + .. tab-item:: Grid + :sync: grid + + Launch the cooperative kernel on a single GPU: + + .. code-block:: cpp + + void* params[] = {}; + // Launching kernel from host. + HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel, + dim3(num_blocks), + dim3(threads_per_block), + 0, + 0, + hipStreamDefault)); + + .. tab-item:: Multi-grid + :sync: multi-grid + + Launch the cooperative kernel over multiple GPUs: + + .. 
code-block:: cpp + + hipLaunchParams *launchParamsList = (hipLaunchParams*)malloc(sizeof(hipLaunchParams) * deviceIDs.size()); + for(int deviceID : deviceIDs) { + + // Set device + HIP_CHECK(hipSetDevice(deviceID)); + + // Create stream + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + // Parameters + void* params[] = {&(d_vector[deviceID]), &(d_block_reduced[deviceID]), &(d_partition_reduced[deviceID])}; + + // Set launchParams + launchParamsList[deviceID].func = (void*)vector_reduce_kernel; + launchParamsList[deviceID].gridDim = dim3(1); + launchParamsList[deviceID].blockDim = dim3(threads_per_block); + launchParamsList[deviceID].sharedMem = 0; + launchParamsList[deviceID].stream = stream; + launchParamsList[deviceID].args = params; + } + + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, + (int)deviceIDs.size(), + hipCooperativeLaunchMultiDeviceNoPreSync)); + +**Device side synchronization** + +.. tab-set:: + .. tab-item:: Thread-block + :sync: thread-block + + The device side code of the thread_block synchronization over single GPUs: + + .. code-block:: cpp + + thread_block g = this_thread_block(); + g.sync(); + + .. tab-item:: Grid + :sync: grid + + The device side code of the grid synchronization over single GPUs: + + .. code-block:: cpp + + grid_group grid = this_grid(); + grid.sync(); + + .. tab-item:: Multi-grid + :sync: multi-grid + + The device side code of the multi-grid synchronization over multiple GPUs: + + .. code-block:: cpp + + multi_grid_group multi_grid = this_multi_grid(); + multi_grid.sync(); + +Unsupported NVIDIA CUDA features +================================ + +HIP doesn't support the following NVIDIA CUDA optional headers: + +* ``cooperative_groups/memcpy_async.h`` +* ``cooperative_groups/reduce.h`` +* ``cooperative_groups/scan.h`` + +HIP doesn't support the following CUDA class in ``cooperative_groups`` namespace: + +* ``cluster_group`` + +HIP doesn't support the following CUDA functions/operators in ``cooperative_groups`` namespace: + +* ``synchronize`` +* ``memcpy_async`` +* ``wait`` and ``wait_prior`` +* ``barrier_arrive`` and ``barrier_wait`` +* ``invoke_one`` and ``invoke_one_broadcast`` +* ``reduce`` +* ``reduce_update_async`` and ``reduce_store_async`` +* Reduce operators ``plus`` , ``less`` , ``greater`` , ``bit_and`` , ``bit_xor`` and ``bit_or`` +* ``inclusive_scan`` and ``exclusive_scan`` diff --git a/docs/index.md b/docs/index.md index 876a8ec42e..7120adcfaa 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,6 +46,7 @@ On non-AMD platforms, like NVIDIA, HIP provides header files required to support * [Debugging with HIP](./how-to/debugging) * {doc}`./how-to/logging` * [Unified Memory](./how-to/unified_memory) +* [Cooperative Groups](./how-to/cooperative_groups) * {doc}`./how-to/faq` ::: @@ -58,6 +59,7 @@ On non-AMD platforms, like NVIDIA, HIP provides header files required to support * [Comparing Syntax for Different APIs](./reference/terms) * [HSA Runtime API for ROCm](./reference/virtual_rocr) * [HIP Managed Memory Allocation API](./reference/unified_memory_reference) +* [HIP Cooperative Groups API](./reference/cooperative_groups) * [List of deprecated APIs](./reference/deprecated_api_list) ::: @@ -69,6 +71,7 @@ On non-AMD platforms, like NVIDIA, HIP provides header files required to support * [HIP test samples](https://github.com/ROCm/hip-tests/tree/develop/samples) * [SAXPY tutorial](./tutorial/saxpy) * [Reduction tutorial](./tutorial/reduction) +* [Cooperative groups 
tutorial](./tutorial/cooperative_groups_tutorial)
:::
diff --git a/docs/reference/cooperative_groups_reference.rst b/docs/reference/cooperative_groups_reference.rst
new file mode 100644
index 0000000000..f044b69219
--- /dev/null
+++ b/docs/reference/cooperative_groups_reference.rst
@@ -0,0 +1,94 @@
+.. meta::
+  :description: This chapter lists types and device API wrappers related to the Cooperative Groups
+                feature. Programmers can directly use them in their kernels to make use of this feature.
+  :keywords: AMD, ROCm, HIP, cooperative groups
+
+.. _cooperative_groups_reference:
+
+*******************************************************************************
+HIP Cooperative Groups API
+*******************************************************************************
+
+Cooperative kernel launches
+===========================
+
+The following host-side functions are used for cooperative kernel launches.
+
+.. doxygenfunction:: hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream)
+
+.. doxygenfunction:: hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream)
+
+.. doxygenfunction:: hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags)
+
+.. doxygenfunction:: hipModuleLaunchCooperativeKernel
+
+.. doxygenfunction:: hipModuleLaunchCooperativeKernelMultiDevice
+
+Cooperative groups classes
+==========================
+
+The following cooperative groups classes can be used on the device side.
+
+.. _thread_group_ref:
+
+.. doxygenclass:: cooperative_groups::thread_group
+   :members:
+
+.. _thread_block_ref:
+
+.. doxygenclass:: cooperative_groups::thread_block
+   :members:
+
+.. _grid_group_ref:
+
+.. doxygenclass:: cooperative_groups::grid_group
+   :members:
+
+.. _multi_grid_group_ref:
+
+.. doxygenclass:: cooperative_groups::multi_grid_group
+   :members:
+
+.. _thread_block_tile_ref:
+
+.. doxygenclass:: cooperative_groups::thread_block_tile
+   :members:
+
+.. _coalesced_group_ref:
+
+.. doxygenclass:: cooperative_groups::coalesced_group
+   :members:
+
+Cooperative groups construct functions
+======================================
+
+The following functions are used to construct instances of the different group types on the device side.
+
+.. doxygenfunction:: cooperative_groups::this_multi_grid
+
+.. doxygenfunction:: cooperative_groups::this_grid
+
+.. doxygenfunction:: cooperative_groups::this_thread_block
+
+.. doxygenfunction:: cooperative_groups::coalesced_threads
+
+.. doxygenfunction:: cooperative_groups::tiled_partition(const ParentCGTy &g)
+
+.. doxygenfunction:: cooperative_groups::tiled_partition(const thread_group &parent, unsigned int tile_size)
+
+.. doxygenfunction:: cooperative_groups::binary_partition(const coalesced_group& cgrp, bool pred)
+
+.. doxygenfunction:: cooperative_groups::binary_partition(const thread_block_tile& tgrp, bool pred)
+
+Cooperative groups exposed API functions
+========================================
+
+The following free functions are the exposed API of the different group types on the device side.
+
+.. doxygenfunction:: cooperative_groups::group_size
+
+.. doxygenfunction:: cooperative_groups::thread_rank
+
+.. doxygenfunction:: cooperative_groups::is_valid
+
+.. 
doxygenfunction:: cooperative_groups::sync \ No newline at end of file diff --git a/docs/reference/cpp_language_extensions.rst b/docs/reference/cpp_language_extensions.rst index cbbcfd1c43..e4bc3782ac 100644 --- a/docs/reference/cpp_language_extensions.rst +++ b/docs/reference/cpp_language_extensions.rst @@ -292,8 +292,7 @@ dimensions to 1. Memory fence instructions ==================================================== -HIP supports ``__threadfence()`` and ``__threadfence_block()``. If you're using ``threadfence_system()`` in -the HIP-Clang path, you can use the following workaround: +HIP supports ``__threadfence()`` and ``__threadfence_block()``. If you're using ``threadfence_system()`` in the HIP-Clang path, you can use the following workaround: #. Build HIP with the ``HIP_COHERENT_HOST_ALLOC`` environment variable enabled. #. Modify kernels that use ``__threadfence_system()`` as follows: @@ -306,9 +305,14 @@ the HIP-Clang path, you can use the following workaround: Synchronization functions ==================================================== + +Synchronization functions causes all threads in the group to wait at this synchronization point, and for all shared and global memory accesses by the threads to complete, before running synchronization. This guarantees the visibility of accessed data for all threads in the group. + The ``__syncthreads()`` built-in function is supported in HIP. The ``__syncthreads_count(int)``, ``__syncthreads_and(int)``, and ``__syncthreads_or(int)`` functions are under development. +The Cooperative Groups API offer options to do synchronization on a developer defined set of thread groups. For further information, check :ref:`Cooperative Groups API ` or :ref:`Cooperative Groups how to `. + Math functions ==================================================== @@ -972,6 +976,8 @@ HIP supports the following kernel language cooperative groups types and function - ✓ - ✓ +For further information, check :ref:`Cooperative Groups API ` or :ref:`Cooperative Groups how to `. + Warp matrix functions ============================================================ diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 335e2a96ea..d7a9278bc2 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -29,6 +29,7 @@ subtrees: - file: how-to/performance_guidelines - file: how-to/debugging - file: how-to/logging + - file: how-to/cooperative_groups - file: how-to/unified_memory title: Unified Memory - file: how-to/faq @@ -41,7 +42,10 @@ subtrees: - file: reference/math_api - file: reference/terms title: Comparing Syntax for different APIs + - file: reference/cooperative_groups_reference + title: HIP Cooperative Groups API - file: reference/virtual_rocr + title: HSA Runtime API for ROCm - file: reference/unified_memory_reference title: HIP Managed Memory Allocation API - file: reference/deprecated_api_list @@ -57,6 +61,7 @@ subtrees: title: HIP test samples - file: tutorial/saxpy - file: tutorial/reduction + - file: tutorial/cooperative_groups_tutorial - caption: About entries: diff --git a/docs/tutorial/cooperative_groups_tutorial.rst b/docs/tutorial/cooperative_groups_tutorial.rst new file mode 100644 index 0000000000..0fd018b546 --- /dev/null +++ b/docs/tutorial/cooperative_groups_tutorial.rst @@ -0,0 +1,240 @@ +.. 
meta:: + :description: HIP cooperative groups tutorial + :keywords: AMD, ROCm, HIP, cooperative groups, tutorial + +******************************************************************************* +Cooperative Groups +******************************************************************************* + +This tutorial demonstrates the basic concepts of cooperative groups in the HIP (Heterogeneous-computing Interface for Portability) programming model and the most essential tooling supporting it. This topic also reviews the commonalities of heterogeneous APIs. Familiarity with the C/C++ compilation model and the language is assumed. + +Prerequisites +============= + +To follow this tutorial, you'll need properly installed drivers and a HIP compiler toolchain to compile your code. Because ROCm HIP supports compiling and running on Linux and Microsoft Windows with AMD and NVIDIA GPUs, review the HIP development package installation before starting this tutorial. For more information, see :doc:`/install/install`. + +Simple HIP Code +=============== + +To become familiar with heterogeneous programming, review the :doc:`SAXPY tutorial ` and the first HIP code subsection. Compiling is also described in that tutorial. + +Tiled partition +=============== + +You can use tiled partition to calculate the sum of ``partition_size`` length sequences and the sum of ``result_size``/ ``BlockSize`` length sequences. The host-side reference implementation is the following: + +.. code-block:: cpp + + // Host-side function to perform the same reductions as executed on the GPU + std::vector ref_reduced(const unsigned int partition_size, + std::vector input) + { + const unsigned int input_size = input.size(); + const unsigned int result_size = input_size / partition_size; + std::vector result(result_size); + + for(unsigned int i = 0; i < result_size; i++) + { + unsigned int partition_result = 0; + for(unsigned int j = 0; j < partition_size; j++) + { + partition_result += input[partition_size * i + j]; + } + result[i] = partition_result; + } + + return result; + } + +Device side code +---------------- + +To calculate the sum of the sets of numbers, the tutorial uses the shared memory-based reduction on the device side. The warp level intrinsics usage is not covered in this tutorial, unlike in the :doc:`reduction tutorial. ` The x input variable is a shared pointer, which needs to be synchronized after every value changes. The ``thread_group`` input parameter can be ``thread_block_tile`` or ``thread_block`` because the ``thread_group`` is the parent class of these types. The ``val`` are the numbers to calculate the sum of. The returned results of this function return the final results of the reduction on thread ID 0 of the ``thread_group``, and for every other thread, the function results are 0. + +.. 
code-block:: cuda + + /// \brief Summation of `unsigned int val`'s in `thread_group g` using shared memory `x` + __device__ unsigned int reduce_sum(thread_group g, unsigned int* x, unsigned int val) + { + // Rank of this thread in the group + const unsigned int group_thread_id = g.thread_rank(); + + // We start with half the group size as active threads + // Every iteration the number of active threads halves, until we processed all values + for(unsigned int i = g.size() / 2; i > 0; i /= 2) + { + // Store value for this thread in a shared, temporary array + x[group_thread_id] = val; + + // Synchronize all threads in the group + g.sync(); + + // If our thread is still active, sum with its counterpart in the other half + if(group_thread_id < i) + { + val += x[group_thread_id + i]; + } + + // Synchronize all threads in the group + g.sync(); + } + + // Only the first thread returns a valid value + if(g.thread_rank() == 0) + return val; + else + return 0; + } + +The ``reduce_sum`` device function is reused to calculate the block and custom +partition sum of the input numbers. The kernel has three sections: + +1. Initialization of the reduction function variables. +2. The reduction of thread block and store the results in global memory. +3. The reduction of custom partition and store the results in global memory. + +1. Initialization of the reduction function variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this code section, the shared memory is declared, the thread_block_group and +custom_partition are defined, and the input variables are loaded from global +memory. + +.. code-block:: cuda + + // threadBlockGroup consists of all threads in the block + thread_block thread_block_group = this_thread_block(); + + // Workspace array in shared memory required for reduction + __shared__ unsigned int workspace[2048]; + + unsigned int output; + + // Input to reduce + const unsigned int input = d_vector[thread_block_group.thread_rank()]; + + // ... + + // Every custom_partition group consists of 16 threads + thread_block_tile custom_partition + = tiled_partition(thread_block_group); + + +2. The reduction of thread block +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this code section, the sum is calculated on ``thread_block_group`` level, then the results are stored in global memory. + +.. code-block:: cuda + + // Perform reduction + output = reduce_sum(thread_block_group, workspace, input); + + // Only the first thread returns a valid value + if(thread_block_group.thread_rank() == 0) + { + d_block_reduced_vector[0] = output; + } + +3. The reduction of custom partition +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this code section, the sum is calculated on the custom partition level, then the results are stored in global memory. The custom partition is a partial block of the thread block, it means the reduction calculate on a shorter sequence of input numbers than at the ``thread_block_group`` case. + +.. code-block:: cuda + + // Perform reduction + output = reduce_sum(custom_partition, &workspace[group_offset], input); + + // Only the first thread in each partition returns a valid value + if(custom_partition.thread_rank() == 0) + { + const unsigned int partition_id = thread_block_group.thread_rank() / PartitionSize; + d_partition_reduced_vector[partition_id] = output; + } + +Host-side code +-------------- + +On the host-side, the following steps are done in the example: + +1. Confirm the cooperative group support on AMD GPUs. +2. Initialize the cooperative group configuration. +3. 
Allocate and copy input to global memory. +4. Launch the cooperative kernel. +5. Save the results from global memory. +6. Free the global memory. + +Only the first, second and fourth steps are important from the cooperative groups aspect, that's why those steps are detailed further. + +1. Confirm the cooperative group support on AMD GPUs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Not all AMD GPUs support cooperative groups. You can confirm support with the following code: + +.. code-block:: cpp + + #ifdef __HIP_PLATFORM_AMD__ + int device = 0; + int supports_coop_launch = 0; + // Check support + // Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK( + hipDeviceGetAttribute(&supports_coop_launch, hipDeviceAttributeCooperativeLaunch, device)); + if(!supports_coop_launch) + { + std::cout << "Skipping, device " << device << " does not support cooperative groups" + << std::endl; + return 0; + } + #endif + +2. Initialize the cooperative group configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In the example, there is only one block in the grid, and the ``threads_per_block`` must be dividable with ``partition_size``. + +.. code-block:: cpp + + // Number of blocks to launch. + constexpr unsigned int num_blocks = 1; + + // Number of threads in each kernel block. + constexpr unsigned int threads_per_block = 64; + + // Total element count of the input vector. + constexpr unsigned int size = num_blocks * threads_per_block; + + // Total elements count of a tiled_partition. + constexpr unsigned int partition_size = 16; + + // Total size (in bytes) of the input vector. + constexpr size_t size_bytes = sizeof(unsigned int) * size; + + static_assert(threads_per_block % partition_size == 0, + "threads_per_block must be a multiple of partition_size"); + +4. Launch the kernel +~~~~~~~~~~~~~~~~~~~~ + +The kernel launch is done with the ``hipLaunchCooperativeKernel`` of the cooperative groups API. + +.. code-block:: cpp + + void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced}; + // Launching kernel from host. + HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel, + dim3(num_blocks), + dim3(threads_per_block), + params, + 0, + hipStreamDefault));\ + + // Check if the kernel launch was successful. + HIP_CHECK(hipGetLastError()); + +Conclusion +========== + +With cooperative groups, you can easily use custom partitions to create custom tiles for custom solutions. You can find the complete code at `cooperative groups ROCm example. `_ diff --git a/docs/understand/programming_model_reference.rst b/docs/understand/programming_model_reference.rst index 6948d05c48..1fe9a44647 100644 --- a/docs/understand/programming_model_reference.rst +++ b/docs/understand/programming_model_reference.rst @@ -87,47 +87,13 @@ of. It relaxes some restrictions of the :ref:`inherent_thread_model` imposed by the strict 1:1 mapping of architectural details to the programming model. -The rich set of APIs introduced by Cooperative Groups allow the programmer to -define their own set of thread groups which may fit their user-cases better than -those defined by the hardware. The set of implicit groups by kernel launch -parameters are still available. - -The thread hierarchy abstraction of Cooperative Groups manifest as depicted in -:numref:`coop_thread_hierarchy`. - -.. _coop_thread_hierarchy: - -.. 
figure:: ../data/understand/programming_model_reference/thread_hierarchy_coop.svg - :alt: Diagram depicting nested rectangles of varying color. The outermost one - titled "Grid", inside sets of different sized rectangles layered on - one another titled "Block". Each "Block" containing sets of uniform - rectangles layered on one another titled "Warp". Each of the "Warp" - titled rectangles filled with downward pointing arrows inside. - - Cooperative group thread hierarchy. - -Multi Grid - An abstraction of potentially multiple simultaneous launches of - the same kernel over multiple devices. Grids inside a multi device kernel - launch need not be of uniform size, thus allowing taking into account - different device capabilities and preferences. - - .. deprecated:: 5.0 - -Grid - Same as the :ref:`inherent_thread_model` Grid entity. The ability to - synchronize over a grid requires the kernel to be launched using the - Cooperative Groups API. - -Block - Same as the :ref:`inherent_thread_model` Block entity. +The Cooperative Groups API lets you define your own thread groups which may fit your use-case better than those defined by the default thread model. .. note:: - Explicit warp-level thread handling is absent from the Cooperative Groups API. - In order to exploit the known hardware SIMD width on which built-in - functionality translates to simpler logic, one may use the group partitioning - part of the API, such as ``tiled_partition``. + The default thread groups defined by kernel launch parameters are still available. See the :ref:inherent thread model for more information. + +For further information, check the :ref:`inherent thread model `. Memory Model ============ diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index c5a81dcccc..41f2cdaaec 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -1061,8 +1061,8 @@ typedef struct dim3 { */ typedef struct hipLaunchParams_t { void* func; ///< Device function symbol - dim3 gridDim; ///< Grid dimentions - dim3 blockDim; ///< Block dimentions + dim3 gridDim; ///< Grid dimensions + dim3 blockDim; ///< Block dimensions void **args; ///< Arguments size_t sharedMem; ///< Shared memory hipStream_t stream; ///< Stream identifier @@ -5582,7 +5582,7 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne * @param [in] kernelParams A list of kernel arguments. * * Please note, HIP does not support kernel launch with total work items defined in dimension with - * size gridDim x blockDim >= 2^32. + * size \f$ gridDim \cdot blockDim \geq 2^{32} \f$. * * @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, * #hipErrorInvalidHandle, #hipErrorInvalidImage, #hipErrorInvalidValue, @@ -5612,8 +5612,8 @@ hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* unsigned int numDevices, unsigned int flags); /** - * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed - * to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute + * @brief Launches kernel f with launch parameters and shared memory on stream with arguments passed + * to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute. * * @param [in] f Kernel to launch. * @param [in] gridDim Grid dimensions specified as multiple of blockDim. 
@@ -5625,7 +5625,7 @@ hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* * default stream is used with associated synchronization rules. * * Please note, HIP does not support kernel launch with total work items defined in dimension with - * size gridDim x blockDim >= 2^32. + * size \f$ gridDim \cdot blockDim \geq 2^{32} \f$. * * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue, #hipErrorCooperativeLaunchTooLarge */ @@ -8638,7 +8638,7 @@ hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject); #ifdef __cplusplus #if defined(__clang__) && defined(__HIP__) template -static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, +static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,template T f, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) { return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, reinterpret_cast(f),dynSharedMemPerBlk,blockSizeLimit); } @@ -8933,12 +8933,16 @@ return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize,(hipFunction_t)kern * * @ingroup Execution * - * @param [in] f device function symbol - * @param [in] gridDim grid dimentions - * @param [in] blockDim block dimentions - * @param [in] kernelParams kernel parameters - * @param [in] sharedMemBytes shared memory in bytes - * @param [in] stream stream on which kernel launched + * \tparam T The type of the kernel function. + * + * @param [in] f Kernel function to launch. + * @param [in] gridDim Grid dimensions specified as multiple of blockDim. + * @param [in] blockDim Block dimensions specified in work-items. + * @param [in] kernelParams A list of kernel arguments. + * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for + * this kernel. The HIP-Clang compiler provides + * support for extern shared declarations. + * @param [in] stream Stream which on the kernel launched. * * @return #hipSuccess, #hipErrorLaunchFailure, #hipErrorInvalidValue, * #hipErrorInvalidResourceHandle @@ -8951,14 +8955,14 @@ inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, blockDim, kernelParams, sharedMemBytes, stream); } /** - * @brief Launches device function on multiple devices where thread blocks can cooperate and - * synchronize on execution. + * @brief Launches kernel function on multiple devices, where thread blocks can + * cooperate and synchronize on execution. * * @ingroup Execution * - * @param [in] launchParamsList list of kernel launch parameters, one per device - * @param [in] numDevices size of launchParamsList array - * @param [in] flags flag to handle launch behavior + * @param [in] launchParamsList List of kernel launch parameters, one per device. + * @param [in] numDevices Size of launchParamsList array. + * @param [in] flags Flag to handle launch behavior. * * @return #hipSuccess, #hipErrorLaunchFailure, #hipErrorInvalidValue, * #hipErrorInvalidResourceHandle