Update compile-time shared memory usage check for device_partition (#543

) * Update compile-time shared memory usage check for device_partition The device partition algorithm uses a default_select_config struct to detect which architecture we're running on. The default_select_config struct eventually causes the creation of a struct of type limit_block_size. This struct is used to detect if the launch configuration that's being used (block size and amount of shared memory per thread) will cause on the selected device to use more than 32 KiB of shared memory. If so, then limit_block_size attempts to reduce the block size (divides it by 2) and checks the shared memory usage again. If the element type is large enough, it is possible to get into a situation where, even if we use the minimum block size (a single wavefront of threads) and give the threads the minimum possible number of elements to work on (1 each), we will still use more than 32 KiB of shared memory. The limit_block_size struct assumes that the amount of shared memory that will be used is equal to the block size multiplied by the amount of memory required per thread. However, the device partition algorithm actually requires slightly more shared memory than this, because it does an extra allocation to store the lookback scan's state. It's not really feasible to move this lookback scan state out of shared memory because all threads in the block need access to it. This change modifies the limit_block_size struct so that it accepts an "ExtraSharedMemory" template parameter, and updates the shared memory check it performs so that it takes this value into account. It also updates the device partition's config-creating code so that it passes in the size of the lookback scan state. * Add device partition unit test to check behaviour around shared memory limit Test the edge case where the data passed to the device partition algorithm will consume the maximum allowable amount of shared memory. Since the algorithm itself also requires some shared memory to store state, this should push us over the max limit. In this case, the block size should be reduced to compensate. --------- Co-authored-by: root <[email protected]>
ROCm · Apr 4, 2024 · 609ae19 · 609ae19
1 parent ea1bef6
commit 609ae19
Show file tree

Hide file tree

Showing 4 changed files with 187 additions and 22 deletions.
diff --git a/rocprim/include/rocprim/device/config_types.hpp b/rocprim/include/rocprim/device/config_types.hpp
@@ -100,10 +100,13 @@ template<
  unsigned int SharedMemoryPerThread,
  // Most kernels require block sizes not smaller than warp
  unsigned int MinBlockSize, 
+ // If kernels require more than MaxBlockSize * SharedMemoryPerThread bytes
+ // (eg. to store some kind of block-wide state), that size can be specified here
+ unsigned int ExtraSharedMemory = 0,
  // Can fit in shared memory?
  // Although GPUs have 64KiB, 32KiB is used here as a "soft" limit,
  // because some additional memory may be required in kernels
- bool = (MaxBlockSize * SharedMemoryPerThread <= (1u << 15))
+ bool = (MaxBlockSize * SharedMemoryPerThread + ExtraSharedMemory <= (1u << 15))
 >
 struct limit_block_size
 {
@@ -112,16 +115,18 @@ struct limit_block_size
  limit_block_size<
  detail::next_power_of_two(MaxBlockSize) / 2,
  SharedMemoryPerThread,
- MinBlockSize
+ MinBlockSize,
+ ExtraSharedMemory
  >::value;
 };
 
 template<
  unsigned int MaxBlockSize,
  unsigned int SharedMemoryPerThread,
- unsigned int MinBlockSize
+ unsigned int MinBlockSize,
+ unsigned int ExtraSharedMemory
 >
-struct limit_block_size<MaxBlockSize, SharedMemoryPerThread, MinBlockSize, true>
+struct limit_block_size<MaxBlockSize, SharedMemoryPerThread, MinBlockSize, ExtraSharedMemory, true>
 {
  static_assert(MaxBlockSize >= MinBlockSize, "Data is too large, it cannot fit in shared memory");
 

diff --git a/rocprim/include/rocprim/device/device_partition.hpp b/rocprim/include/rocprim/device/device_partition.hpp
@@ -149,15 +149,18 @@ hipError_t partition_impl(void * temporary_storage,
  using key_type = typename std::iterator_traits<KeyIterator>::value_type;
  using value_type = typename std::iterator_traits<ValueIterator>::value_type;
 
+ using offset_scan_state_type = detail::lookback_scan_state<offset_type>;
+ using offset_scan_state_with_sleep_type = detail::lookback_scan_state<offset_type, true>;
+
  // Get default config if Config is default_config
  using config = default_or_custom_config<
  Config,
- default_select_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
+ // Note: the partition algorithm requires some extra shared memory space for an instance of 
+ // offset_scan_state_type. Pass it's size to default_select_config here so that it can select
+ // an appropriate block size (one that ensures that we don't run out of shared memory).
+ default_select_config<ROCPRIM_TARGET_ARCH, key_type, value_type, sizeof(offset_scan_state_type)>
  >;
 
- using offset_scan_state_type = detail::lookback_scan_state<offset_type>;
- using offset_scan_state_with_sleep_type = detail::lookback_scan_state<offset_type, true>;
-
  static constexpr unsigned int block_size = config::block_size;
  static constexpr unsigned int items_per_thread = config::items_per_thread;
  static constexpr auto items_per_block = block_size * items_per_thread;

diff --git a/rocprim/include/rocprim/device/device_select_config.hpp b/rocprim/include/rocprim/device/device_select_config.hpp
@@ -76,14 +76,14 @@ struct select_config
 namespace detail
 {
 
-template<class Key>
+template<class Key, unsigned int ExtraSharedMemory>
 struct select_config_803
 {
  static constexpr unsigned int item_scale =
  ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));
 
  using type = select_config<
- limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
+ limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64, ExtraSharedMemory>::value,
  ::rocprim::max(1u, 13u / item_scale),
  ::rocprim::block_load_method::block_load_transpose,
  ::rocprim::block_load_method::block_load_transpose,
@@ -92,14 +92,14 @@ struct select_config_803
  >;
 };
 
-template<class Key>
+template<class Key, unsigned int ExtraSharedMemory>
 struct select_config_900
 {
  static constexpr unsigned int item_scale =
  ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));
 
  using type = select_config<
- limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
+ limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64, ExtraSharedMemory>::value,
  ::rocprim::max(1u, 15u / item_scale),
  ::rocprim::block_load_method::block_load_transpose,
  ::rocprim::block_load_method::block_load_transpose,
@@ -108,14 +108,14 @@ struct select_config_900
  >;
 };
 
-template<class Value>
+template<class Value, unsigned int ExtraSharedMemory>
 struct select_config_90a
 {
  static constexpr unsigned int item_scale =
  ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
 
  using type = select_config<
- limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+ limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64, ExtraSharedMemory>::value,
  ::rocprim::max(1u, 15u / item_scale),
  ::rocprim::block_load_method::block_load_transpose,
  ::rocprim::block_load_method::block_load_transpose,
@@ -124,14 +124,14 @@ struct select_config_90a
  >;
 };
 
-template<class Value>
+template<class Value, unsigned int ExtraSharedMemory>
 struct select_config_1030
 {
  static constexpr unsigned int item_scale =
  ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
 
  using type = select_config<
- limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
+ limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32, ExtraSharedMemory>::value,
  ::rocprim::max(1u, 15u / item_scale),
  ::rocprim::block_load_method::block_load_transpose,
  ::rocprim::block_load_method::block_load_transpose,
@@ -141,15 +141,15 @@ struct select_config_1030
 };
 
 
-template<unsigned int TargetArch, class Key, class /*Value*/>
+template<unsigned int TargetArch, class Key, class /*Value*/, unsigned int ExtraSharedMemory = 0>
 struct default_select_config
  : select_arch<
  TargetArch,
- select_arch_case<803, select_config_803<Key>>,
- select_arch_case<900, select_config_900<Key>>,
- select_arch_case<ROCPRIM_ARCH_90a, select_config_90a<Key>>,
- select_arch_case<1030, select_config_1030<Key>>,
- select_config_803<Key>
+ select_arch_case<803, select_config_803<Key, ExtraSharedMemory>>,
+ select_arch_case<900, select_config_900<Key, ExtraSharedMemory>>,
+ select_arch_case<ROCPRIM_ARCH_90a, select_config_90a<Key, ExtraSharedMemory>>,
+ select_arch_case<1030, select_config_1030<Key, ExtraSharedMemory>>,
+ select_config_803<Key, ExtraSharedMemory>
  > { };
 
 } // end namespace detail

diff --git a/test/rocprim/test_device_partition.cpp b/test/rocprim/test_device_partition.cpp
@@ -1464,3 +1464,160 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartitionThreeWay)
  HIP_CHECK(hipStreamDestroy(stream));
  }
 }
+
+// This test checks to make sure that the block size is reduced correctly
+// when our data size and type are set in a way that we will exceed the shared
+// memory limit. Since the block size calculation is done at compile time,
+// if the block size is not correctly reduced, this test will fail to compile.
+TEST(RocprimDevicePartitionBlockSizeTests, BlockSize)
+{
+ int device_id = test_common_utils::obtain_device_from_ctest();
+ SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
+ HIP_CHECK(hipSetDevice(device_id));
+
+ // Create a large struct to test with. It must be big enough that when
+ // we the use default block size (defined in rocprim::default_select_config 
+ // struct as 256), giving one instance to each thread will cause us to hit
+ // 32 KiB of shared memory (the limit enforced by the rocprim::limit_block_size
+ // struct's boolean template parameter). Since the device_partition algorithm also
+ // uses some shared memory to store state, this will cause the total usage to exceed
+ // the 32 KiB limit. If everything's working correctly, this should be detected in 
+ // the limit_block_size's template logic, and it should reduce the block size.
+ const size_t test_obj_size = 128; // Choose 128, since 256 * 128 = 2^15 bytes (32 KiB).
+ struct TestObject
+ {
+ unsigned char data[test_obj_size];
+
+ bool operator==(const TestObject& other) const
+ {
+ bool equal = true;
+ for (size_t i = 0; equal && i < test_obj_size; i++)
+ equal = data[i] == other.data[i];
+
+ return equal;
+ }
+ };
+
+ using T = TestObject; // input data type
+ using U = TestObject; // output data type
+ const bool debug_synchronous = false;
+ const hipStream_t stream = 0; // default stream
+
+ auto select_op = [] __host__ __device__ (const T& value) -> bool
+ {
+ // The data values are in [0, 255]. Partition on the midpoint.
+ if(value.data[0] == 128) return true;
+ return false;
+ };
+
+ // Use some power of two and off-by-one-from-power-of-two data sizes.
+ const std::vector<size_t> sizes = {256, 257, 511, 512, 1024, 1025};
+
+ for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+ {
+ unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+ SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+ for(auto size : sizes)
+ {
+ SCOPED_TRACE(testing::Message() << "with size = " << size);
+
+ // Generate data
+ std::vector<unsigned char> input_data = test_utils::get_random_data<unsigned char>(size * test_obj_size, 0, 255, seed_value);
+ std::vector<T> input(size);
+ for (size_t i = 0; i < size; i++)
+ memcpy(input[i].data, input_data.data() + i * test_obj_size, test_obj_size);
+
+ T * d_input;
+ U * d_output;
+ unsigned int * d_selected_count_output;
+ HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(T)));
+ HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, input.size() * sizeof(U)));
+ HIP_CHECK(test_common_utils::hipMallocHelper(&d_selected_count_output, sizeof(unsigned int)));
+ HIP_CHECK(
+ hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice));
+
+ // Calculate expected_selected and expected_rejected results on host
+ std::vector<U> expected_selected;
+ std::vector<U> expected_rejected;
+ expected_selected.reserve(input.size()/2);
+ expected_rejected.reserve(input.size()/2);
+ for(size_t i = 0; i < input.size(); i++)
+ {
+ if(select_op(input[i]))
+ {
+ expected_selected.push_back(input[i]);
+ }
+ else
+ {
+ expected_rejected.push_back(input[i]);
+ }
+ }
+ std::reverse(expected_rejected.begin(), expected_rejected.end());
+
+ // temp storage
+ size_t temp_storage_size_bytes;
+ // Get size of d_temp_storage
+ HIP_CHECK(rocprim::partition(
+ nullptr,
+ temp_storage_size_bytes,
+ d_input,
+ d_output,
+ d_selected_count_output,
+ input.size(),
+ select_op,
+ stream,
+ debug_synchronous));
+
+ // temp_storage_size_bytes must be >0
+ ASSERT_GT(temp_storage_size_bytes, 0);
+
+ // allocate temporary storage
+ void* d_temp_storage = nullptr;
+ HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes));
+
+ // Run
+ HIP_CHECK(rocprim::partition(
+ d_temp_storage,
+ temp_storage_size_bytes,
+ d_input,
+ d_output,
+ d_selected_count_output,
+ input.size(),
+ select_op,
+ stream,
+ debug_synchronous));
+
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Check if number of selected value is as expected_selected
+ unsigned int selected_count_output = 0;
+ HIP_CHECK(hipMemcpy(&selected_count_output,
+ d_selected_count_output,
+ sizeof(unsigned int),
+ hipMemcpyDeviceToHost));
+ ASSERT_EQ(selected_count_output, expected_selected.size());
+
+ // Check if output values are as expected_selected
+ std::vector<U> output(input.size());
+ HIP_CHECK(hipMemcpy(output.data(),
+ d_output,
+ output.size() * sizeof(U),
+ hipMemcpyDeviceToHost));
+
+ std::vector<U> output_rejected;
+ for(size_t i = 0; i < expected_rejected.size(); i++)
+ {
+ auto j = i + expected_selected.size();
+ output_rejected.push_back(output[j]);
+ }
+ ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected_selected, expected_selected.size()));
+ ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_rejected, expected_rejected, expected_rejected.size()));
+
+ hipFree(d_input);
+ hipFree(d_output);
+ hipFree(d_selected_count_output);
+ hipFree(d_temp_storage);
+ }
+ }
+}