Commit 204a3e6: Whitespace and grammar fix.
neon60 committed Jun 20, 2024 (1 parent: 94cf72a)
Showing 3 changed files with 130 additions and 143 deletions.

docs/how-to/cooperative_groups.rst (222 changes: 105 additions & 117 deletions)
Cooperative Groups in HIP
*******************************************************************************

Cooperative Groups is an extension to the ROCm programming model that provides developers with a flexible grouping mechanism. This feature was introduced with ROCm 4.1 on the AMD platform and CUDA 9.0 on the NVIDIA platform.

The API is accessible in the ``cooperative_groups`` namespace after the ``cooperative_groups.h`` header is included. The header contains the following elements:

* Data types of cooperative groups.
* Synchronization member functions of the groups.
* Member functions to query group properties.
* Hardware-accelerated operations over the whole group, such as shuffles.
* Static functions to create groups and subgroups.
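
As a minimal sketch of how these elements fit together (the kernel name here is hypothetical, and on the HIP platform the header is typically included as ``hip/hip_cooperative_groups.h``):

.. code-block:: cuda

  #include <hip/hip_runtime.h>
  #include <hip/hip_cooperative_groups.h>

  namespace cg = cooperative_groups;

  // Hypothetical kernel: obtain the current thread block as a group
  // and synchronize it through the group interface instead of
  // calling __syncthreads() directly.
  __global__ void example_kernel() {
      cg::thread_block block = cg::this_thread_block();
      block.sync();
  }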

Cooperative Group Simple Example
================================
You'll find the code differences between the original block model and the Cooperative Groups model in the following examples.

**Original Block**

.. code-block:: cuda

  __device__ int reduce_sum(int *shared, int val) {

      // Thread ID
      const unsigned int thread_id = threadIdx.x;

      // Every iteration the number of active threads
      // halves, until we processed all values
      for(unsigned int i = blockDim.x / 2; i > 0; i /= 2) {
          // Store value in shared memory with thread ID
          shared[thread_id] = val;

          // Synchronize all threads
          __syncthreads();

          // Active thread sum up
          if(thread_id < i)
              val += shared[thread_id + i];

          // Synchronize all threads
          __syncthreads();
      }

      // ...
  }

**Cooperative Groups**

.. code-block:: cuda

  __device__ int reduce_sum(thread_group g,
                            int *shared,
                            int val) {

      // Thread ID
      const unsigned int group_thread_id = g.thread_rank();

      // Every iteration the number of active threads
      // halves, until we processed all values
      for(unsigned int i = g.size() / 2; i > 0; i /= 2) {
          // Store value in shared memory with thread ID
          shared[group_thread_id] = val;

          // Synchronize all threads in the group
          g.sync();

          // Active thread sum up
          if(group_thread_id < i)
              val += shared[group_thread_id + i];

          // Synchronize all threads in the group
          g.sync();
      }

      // ...
  }

**Original Block**

.. code-block:: cuda

  __global__ void sum_kernel(...) {

      // ...

      // Workspace array in shared memory
      __shared__ unsigned int workspace[2048];

      // ...

      // Perform reduction
      output = reduce_sum(workspace, input);

      // ...
  }

**Cooperative Groups**

.. code-block:: cuda

  __global__ void sum_kernel(...) {

      // ...

      // Workspace array in shared memory
      __shared__ unsigned int workspace[2048];

      // ...

      thread_block thread_block_group = this_thread_block();
      // Perform reduction
      output = reduce_sum(thread_block_group, workspace, input);

      // ...
  }


The kernel launch also differs with cooperative groups, as it depends on the group type. For example, to launch a grid group on a single GPU, ``hipLaunchCooperativeKernel`` has to be used.
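
As a sketch of such a cooperative launch (the kernel name, grid and block dimensions, and arguments are placeholders, not part of the original example):

.. code-block:: cuda

  // Hypothetical kernel that synchronizes across the whole grid.
  __global__ void my_kernel(int *data) {
      cooperative_groups::grid_group grid = cooperative_groups::this_grid();
      // ...
      grid.sync(); // valid only when launched cooperatively
  }

  void launch(int *d_data) {
      void *args[] = {&d_data};
      // Cooperative launch: all blocks must be resident on the device
      // at the same time, so the grid size is limited by occupancy.
      hipLaunchCooperativeKernel((void*)my_kernel,
                                 dim3(32),         // grid dimensions
                                 dim3(256),        // block dimensions
                                 args,
                                 0,                // dynamic shared memory
                                 hipStreamDefault);
  }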

Group Types
===========

Group types are based on the levels of synchronization and data sharing among threads.

Thread-block group
------------------

Represents an intra-workgroup cooperative group type where the participating threads within the group are the same threads that participated in the currently executing ``workgroup``.

.. code-block:: cpp

  class thread_block;

Constructed via:

.. code-block:: cpp

  thread_block g = this_thread_block();

The ``group_index()``, ``thread_index()``, ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()``, ``sync()`` and ``group_dim()`` member functions are public members of the ``thread_block`` class. For further details, check the :ref:`thread_block references <thread_block_ref>`.
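
For instance (a hypothetical kernel, shown only to illustrate the member functions), the block-level members can be used as follows:

.. code-block:: cuda

  __global__ void block_info_kernel() {
      cooperative_groups::thread_block block =
          cooperative_groups::this_thread_block();

      // group_index() is the block's index in the grid (like blockIdx),
      // thread_index() is the thread's index in the block (like threadIdx).
      dim3 block_idx  = block.group_index();
      dim3 thread_idx = block.thread_index();

      // thread_rank() gives a linearized thread ID within the group.
      unsigned int rank = block.thread_rank();

      // Synchronize the whole block.
      block.sync();
  }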

Grid group
------------
Constructed via:

.. code-block:: cpp

  grid_group g = this_grid();

The ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()`` and ``sync()`` member functions are public members of the ``grid_group`` class. For further details, check the :ref:`grid_group references <grid_group_ref>`.

Multi-grid group
------------------
Constructed via:

.. code-block:: cpp

  multi_grid_group g = this_multi_grid();

The ``num_grids()``, ``grid_rank()``, ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()`` and ``sync()`` member functions are public members of the ``multi_grid_group`` class. For further details, check the :ref:`multi_grid_group references <multi_grid_group_ref>`.

Thread-block tile
------------------
The template parameter of ``tiled_partition`` defines the size of the new thread group at compile time.
Constructed via:

.. code-block:: cpp

  template <unsigned int Size, typename ParentT>
  _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)

.. note::

  * ``Size`` must be a power of 2 and not larger than the wavefront size.
  * ``shfl`` functions support integer or float types.

The ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()``, ``sync()``, ``meta_group_rank()``, ``meta_group_size()``, ``shfl(...)``, ``shfl_down(...)``, ``shfl_up(...)`` and ``shfl_xor(...)`` member functions are public members of the ``thread_block_tile`` class. For further details, check the :ref:`thread_block_tile references <thread_block_tile_ref>`.
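
As a sketch of tiled partitioning (a hypothetical kernel; the tile size of 16 assumes a wavefront of at least 16 threads):

.. code-block:: cuda

  __global__ void tile_reduce_kernel(int *data, int *out) {
      namespace cg = cooperative_groups;
      cg::thread_block block = cg::this_thread_block();

      // Partition the block into tiles of 16 threads; 16 is a power
      // of 2 and not larger than the wavefront size.
      cg::thread_block_tile<16> tile = cg::tiled_partition<16>(block);

      int val = data[block.thread_rank()];

      // Shuffle-based reduction within the tile: no shared memory
      // and no block-wide synchronization needed.
      for (unsigned int i = tile.size() / 2; i > 0; i /= 2) {
          val += tile.shfl_down(val, i);
      }

      // The first thread of each tile holds the tile's sum;
      // meta_group_rank() is the tile's index within the block.
      if (tile.thread_rank() == 0) {
          out[tile.meta_group_rank()] = val;
      }
  }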

Coalesced groups
------------------
Constructed via:

.. code-block:: cpp

  coalesced_group active = coalesced_threads();

.. note::

  * ``shfl`` functions support integer or float types.

The ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()``, ``sync()``, ``meta_group_rank()``, ``meta_group_size()``, ``shfl(...)``, ``shfl_down(...)`` and ``shfl_up(...)`` member functions are public members of the ``coalesced_group`` class. For more information, see :ref:`coalesced_group references <coalesced_group_ref>`.
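
A common use of coalesced groups is aggregating atomics across the threads that happen to be active in a divergent branch. The following hypothetical kernel (names and logic are illustrative, not from the original document) sketches this:

.. code-block:: cuda

  __global__ void coalesced_kernel(const int *input, int *counter) {
      namespace cg = cooperative_groups;

      if (input[threadIdx.x] > 0) {
          // Only the threads that entered this branch are in the group.
          cg::coalesced_group active = cg::coalesced_threads();

          // One atomic per group instead of one per thread.
          int offset = 0;
          if (active.thread_rank() == 0) {
              offset = atomicAdd(counter, active.size());
          }
          // Broadcast the base offset from lane 0 to the whole group.
          offset = active.shfl(offset, 0);

          // Each thread can now use offset + active.thread_rank()
          // as its unique slot.
      }
  }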

Synchronization
=================
This section describes how to synchronize the group over multiple GPUs:
1. Confirm the cooperative launch capability over multiple GPUs:

.. code-block:: cpp

  // Check support of cooperative groups
  std::vector<int> deviceIDs;
  for(int deviceID = 0; deviceID < device_count; deviceID++) {
  #ifdef __HIP_PLATFORM_AMD__
      int supports_coop_launch = 0;
      HIP_CHECK(
          hipDeviceGetAttribute(
              &supports_coop_launch,
              hipDeviceAttributeCooperativeMultiDeviceLaunch,
              deviceID));
      if(!supports_coop_launch) {
          std::cout << "Skipping, device " << deviceID << " does not support cooperative groups"
                    << std::endl;
      }
      else
  #endif
      {
          std::cout << deviceID << std::endl;
          // Collect valid deviceIDs.
          deviceIDs.push_back(deviceID);
      }
  }
  if(!deviceIDs.size())
  {
      std::cout << "No valid GPU found." << std::endl;
  } else {
      std::cout << "Valid GPUs number: " << deviceIDs.size() << std::endl;
  }

2. Launch the cooperative kernel over multiple GPUs:
.. code-block:: cpp

  hipLaunchParams *launchParamsList = (hipLaunchParams*)malloc(sizeof(hipLaunchParams) * deviceIDs.size());

  for(int deviceID : deviceIDs) {
      // Set device
      HIP_CHECK(hipSetDevice(deviceID));

      // Create stream
      hipStream_t stream;
      HIP_CHECK(hipStreamCreate(&stream));

      // Parameters
      void* params[] = {&(d_vector[deviceID]), &(d_block_reduced[deviceID]), &(d_partition_reduced[deviceID])};

      // Set launchParams
      launchParamsList[deviceID].func = (void*)vector_reduce_kernel<partition_size>;
      launchParamsList[deviceID].gridDim = dim3(1);
      launchParamsList[deviceID].blockDim = dim3(threads_per_block);
      // ...

Unsupported NVIDIA CUDA features
================================

HIP doesn't support the following NVIDIA CUDA optional headers:

* ``cooperative_groups/memcpy_async.h``
* ``cooperative_groups/reduce.h``
* ``cooperative_groups/scan.h``

HIP doesn't support the following CUDA class in ``cooperative_groups`` namespace:

* ``cluster_group``

HIP doesn't support the following CUDA functions/operators in ``cooperative_groups`` namespace:

* ``synchronize``
* ``memcpy_async``
* ``wait`` and ``wait_prior``
* ``barrier_arrive`` and ``barrier_wait``
docs/reference/cooperative_groups_reference.rst (2 changes: 1 addition & 1 deletion)

The following functions and classes are stored in the https://github.com/ROCm/cl

.. doxygenclass:: cooperative_groups::multi_grid_group
   :members:

.. _thread_block_tile_ref:

.. doxygenclass:: cooperative_groups::thread_block_tile