.. Commit a4b794e by neon60, May 10, 2024: "Update Cooperative Groups How to"
   (docs/how-to/cooperative_groups.rst)
*******************************************************************************
Cooperative Groups in HIP
*******************************************************************************

Cooperative Groups is an extension of the existing ROCm programming model that
gives developers a more flexible thread-grouping mechanism. It introduces APIs
to launch, group, subdivide, synchronize and identify threads, as well as a
set of predefined group-collective algorithms and, most importantly, a
matching threading model to reason about. The feature was first introduced in
CUDA 9.

The API is accessible through the ``cooperative_groups`` namespace after the
``cooperative_groups.h`` header is included. The header contains the following
elements:

* Data types for representing groups
* Operations to generate the implicit groups
* Collectives for partitioning existing groups into new groups
* Operations to synchronize all threads within a group
* Operations to inspect group properties
* Collectives that expose low-level, group-specific and often
  hardware-accelerated operations
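
A minimal sketch of the above (the kernel name and the use of the HIP header
``hip/hip_cooperative_groups.h`` are illustrative assumptions, not taken from
this document): a kernel obtains a handle to its own thread block and
synchronizes on it.

.. code-block:: C++

   #include <hip/hip_runtime.h>
   #include <hip/hip_cooperative_groups.h>

   namespace cg = cooperative_groups;

   __global__ void example_kernel(float* data)
   {
       // Handle to the group of all threads in the current block
       cg::thread_block block = cg::this_thread_block();

       data[block.thread_rank()] *= 2.0f;

       // Equivalent to __syncthreads(), expressed on the group object
       block.sync();
   }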

The following table shows the code differences compared to the original
thread-block model.

.. list-table:: Cooperative Group Example
:header-rows: 1
:widths: 50,50

* - **Original Block**
- **Cooperative Groups**

* - .. code-block:: C++

__device__ int reduce_sum(int *shared, int val) {
// Thread ID
const unsigned int thread_id = threadIdx.x;

// Every iteration the number of active threads halves,
// until we processed all values
for(unsigned int i = blockDim.x / 2; i > 0; i /= 2) {
// Store value in shared memory with thread ID
shared[thread_id] = val;

// Synchronize all threads
__syncthreads();

// Active thread sum up
if(thread_id < i)
val += shared[thread_id + i];

// Synchronize all threads
__syncthreads();
}

// ...
}

- .. code-block:: C++

__device__ int reduce_sum(thread_group g, int *shared, int val) {
// Thread ID
const unsigned int group_thread_id = g.thread_rank();

// Every iteration the number of active threads halves,
// until we processed all values
for(unsigned int i = g.size() / 2; i > 0; i /= 2) {
// Store value in shared memory with thread ID
shared[group_thread_id] = val;

// Synchronize all threads in the group
g.sync();

// Active thread sum up
if(group_thread_id < i)
val += shared[group_thread_id + i];

// Synchronize all threads in the group
g.sync();
}

// ...
}

* - .. code-block:: C++

__global__ void sum_kernel(...) {
// ...

// Workspace array in shared memory
__shared__ unsigned int workspace[2048];

// ...


// Perform reduction
output = reduce_sum(workspace, input);

// ...
}

- .. code-block:: C++

__global__ void sum_kernel(...) {
// ...

// Workspace array in shared memory
__shared__ unsigned int workspace[2048];

// ...

thread_block thread_block_group = this_thread_block();
// Perform reduction
output = reduce_sum(thread_block_group, workspace, input);

// ...
}

The kernel launch is also different in the Cooperative Groups case, and
depends on the group type. For grid groups on a single GPU,
``hipLaunchCooperativeKernel`` has to be used.

Group Types
=============

Thread Block Group
--------------------

Represents an intra-block group, where all threads of the group reside in the
same thread block. It is created with ``this_thread_block()``.

Cluster Group
---------------

Grid Group
------------

Represents all threads of a kernel launch on a single device. It is created
with ``this_grid()`` and allows synchronization across all blocks of the grid.

Multi Grid Group
------------------

Represents all threads of a kernel launched cooperatively across multiple
devices. It is created with ``this_multi_grid()``.

Thread Block Tile
------------------

A statically sized partition of a group, created with
``tiled_partition<Size>(parent_group)``, where the tile size is a compile-time
power of two not exceeding the warp (wavefront) size.
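
As a sketch (the kernel name is illustrative), a block can be partitioned into
tiles and each tile synchronized independently:

.. code-block:: C++

   namespace cg = cooperative_groups;

   __global__ void tile_kernel(int* data)
   {
       cg::thread_block block = cg::this_thread_block();

       // Partition the block into tiles of 16 threads each
       cg::thread_block_tile<16> tile = cg::tiled_partition<16>(block);

       data[block.thread_rank()] = tile.thread_rank();

       // Synchronizes only the 16 threads of this tile
       tile.sync();
   }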

Coalesced Groups
------------------

The group of threads in the current warp (wavefront) that are active at the
point of the call, for example inside a divergent branch. It is created with
``coalesced_threads()``.
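
A sketch of a coalesced group inside a divergent branch (the kernel name is
illustrative):

.. code-block:: C++

   namespace cg = cooperative_groups;

   __global__ void coalesced_kernel(int* counter)
   {
       if(threadIdx.x % 2 == 0)
       {
           // Only the active (even) threads of the wavefront are in the group
           cg::coalesced_group active = cg::coalesced_threads();

           // Thread 0 of the coalesced group increments the counter once
           if(active.thread_rank() == 0)
               atomicAdd(counter, 1);
       }
   }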

Synchronization
=================


Intra-Workgroup or Intra-Block Synchronization
-----------------------------------------------

Any intra-block group (thread block, thread block tile or coalesced group) can
be synchronized with the ``sync()`` member function of the group; no special
kernel launch is required.

Grid Synchronization
---------------------

Check the cooperative launch capability on a single AMD GPU:

.. code-block:: C++

int device = 0;
int supports_coop_launch = 0;
// Check support
// Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices
HIP_CHECK(hipGetDevice(&device));
HIP_CHECK(
hipDeviceGetAttribute(&supports_coop_launch, hipDeviceAttributeCooperativeLaunch, device));
if(!supports_coop_launch)
{
std::cout << "Skipping, device " << device << " does not support cooperative groups"
<< std::endl;
return 0;
}

Launch the cooperative kernel on a single GPU:

.. code-block:: C++

void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced};
// Launching kernel from host.
HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel<partition_size>,
dim3(num_blocks),
dim3(threads_per_block),
params,
0,
hipStreamDefault));


Device-side synchronization on a single GPU:

.. code-block:: C++

grid_group grid = this_grid();
grid.sync();
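
As a sketch of why grid-wide synchronization is useful (the kernel is
illustrative and ``block_local_work()`` is a hypothetical placeholder): a
two-phase kernel can let every block finish its first phase before any block
starts the second.

.. code-block:: C++

   namespace cg = cooperative_groups;

   __global__ void two_phase_kernel(float* partial, float* result)
   {
       cg::grid_group grid = cg::this_grid();

       // Phase 1: every block writes its partial result
       // (block_local_work() is a placeholder for per-block computation)
       partial[blockIdx.x] = block_local_work();

       // Wait until all blocks in the grid finished phase 1
       grid.sync();

       // Phase 2: one thread combines the partial results
       if(grid.thread_rank() == 0)
       {
           float sum = 0.0f;
           for(unsigned int i = 0; i < gridDim.x; ++i)
               sum += partial[i];
           *result = sum;
       }
   }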

Multi-GPU Synchronization
-----------------------------

Check the cooperative launch capability over multiple GPUs:

.. code-block:: C++

#ifdef __HIP_PLATFORM_AMD__
int device = 0;
int supports_coop_launch = 0;
// Check support
// Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices
for (int i = 0; i < numGPUs; i++) {
HIP_CHECK(hipGetDevice(&device));
HIP_CHECK(
hipDeviceGetAttribute(
&supports_coop_launch,
hipDeviceAttributeCooperativeMultiDeviceLaunch,
device));
if(!supports_coop_launch)
{
std::cout << "Skipping, device " << device << " does not support cooperative groups"
<< std::endl;
return 0;
}
}
#endif

Launch the cooperative kernel on multiple GPUs. A sketch, assuming one
argument array and one non-default stream per device have already been
prepared (``params_list``, ``streams`` and ``num_gpus`` are illustrative
names):

.. code-block:: C++

    // One hipLaunchParams entry per device
    std::vector<hipLaunchParams> launch_params(num_gpus);
    for(int i = 0; i < num_gpus; ++i)
    {
        launch_params[i].func = reinterpret_cast<void*>(vector_reduce_kernel<partition_size>);
        launch_params[i].gridDim = dim3(num_blocks);
        launch_params[i].blockDim = dim3(threads_per_block);
        launch_params[i].args = params_list[i];
        launch_params[i].sharedMem = 0;
        // Each device needs its own, non-default stream
        launch_params[i].stream = streams[i];
    }
    HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launch_params.data(), num_gpus, 0));

Device-side synchronization across multiple GPUs:

.. code-block:: C++

multi_grid_group multi_grid = this_multi_grid();
multi_grid.sync();



Missing CUDA features
======================

The following CUDA optional headers are not supported on HIP:

.. code-block:: C++

// Optionally include for memcpy_async() collective
#include <cooperative_groups/memcpy_async.h>
// Optionally include for reduce() collective
#include <cooperative_groups/reduce.h>
// Optionally include for inclusive_scan() and exclusive_scan() collectives
#include <cooperative_groups/scan.h>

The kernel-level collectives provided by these headers (``memcpy_async()``,
``reduce()``, ``inclusive_scan()`` and ``exclusive_scan()``) are therefore not
available on either the host or the device side in HIP.

The following Cooperative Groups features, added in recent CUDA versions, are
also missing from HIP:

* CUDA 12.2: ``barrier_arrive`` and ``barrier_wait`` member functions for
  ``grid_group`` and ``thread_block``.
* CUDA 12.1: ``invoke_one`` and ``invoke_one_broadcast`` APIs.
* CUDA 12.0: the following experimental APIs were moved to the main namespace:

  * asynchronous ``reduce`` and ``scan`` updates (added in CUDA 11.7)
  * ``thread_block_tile`` larger than 32 (added in CUDA 11.1); it is no longer
    required to provide memory using the ``block_tile_memory`` object in order
    to create these large tiles on Compute Capability 8.0 or higher.
