diff --git a/include/hip/hip_api.h b/include/hip/hip_api.h index a03717c..48e3fdb 100644 --- a/include/hip/hip_api.h +++ b/include/hip/hip_api.h @@ -25,6 +25,18 @@ #include // BEGIN INTRINSICS +inline +std::int32_t __all(std::int32_t predicate) noexcept +{ + return hip::detail::all(predicate); +} + +inline +std::int32_t __any(std::int32_t predicate) noexcept +{ + return hip::detail::any(predicate); +} + inline std::uint64_t __ballot(std::int32_t predicate) noexcept { @@ -113,7 +125,7 @@ template< typename T, std::enable_if_t< (std::is_integral_v || std::is_floating_point_v) && - (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr> + sizeof(T) <= 8>* = nullptr> inline T __shfl(T var, std::int32_t src_lane, std::int32_t width = warpSize) noexcept { @@ -124,7 +136,7 @@ template< typename T, std::enable_if_t< (std::is_integral_v || std::is_floating_point_v) && - (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr> + sizeof(T) <= 8>* = nullptr> inline T __shfl_down( T var, std::uint32_t delta, std::int32_t width = warpSize) noexcept @@ -136,7 +148,7 @@ template< typename T, std::enable_if_t< (std::is_integral_v || std::is_floating_point_v) && - (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr> + sizeof(T) <= 8>* = nullptr> inline T __shfl_up( T var, std::uint32_t delta, std::int32_t width = warpSize) noexcept @@ -148,7 +160,7 @@ template< typename T, std::enable_if_t< (std::is_integral_v || std::is_floating_point_v) && - (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr> + sizeof(T) <= 8>* = nullptr> inline T __shfl_xor( T var, std::int32_t src_lane, std::int32_t width = warpSize) noexcept diff --git a/src/include/hip/detail/intrinsics.hpp b/src/include/hip/detail/intrinsics.hpp index f8a4c40..be41516 100644 --- a/src/include/hip/detail/intrinsics.hpp +++ b/src/include/hip/detail/intrinsics.hpp @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * Copyright (c) 2020 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. * See 'LICENSE' in the project root for license information. * -------------------------------------------------------------------------- */ #pragma once @@ -31,13 +31,65 @@ namespace hip std::uint64_t ballot(std::int32_t x) noexcept { const auto tidx{id(Fiber::this_fiber()) % warpSize}; - auto& lds{Tile::scratchpad, 1>()[0]}; - lds[tidx] = static_cast(x); + Tile::predicate()[tidx] = x; - barrier(Tile::this_tile()); + Tile::this_tile().barrier(); + + const auto r{Tile::predicate().to_ullong()}; + + Tile::this_tile().barrier(); - return lds.to_ullong(); + return r; + } + + template>* = nullptr> + inline + std::uint32_t pop_count(T x) noexcept + { + [[maybe_unused]] + constexpr auto popcnt{[](auto&& x) constexpr noexcept { + return std::bitset(x).count(); + }}; + + if constexpr (sizeof(T) == sizeof(std::uint32_t)) { + #if defined(_MSC_VER) + return __popcnt(x); + #elif defined(__has_builtin) + #if __has_builtin(__builtin_popcount) + return __builtin_popcount(x); + #else + return popcnt(x); + #endif + #else + return popcnt(x); + #endif + } + else { + #if defined(_MSC_VER) + return static_cast(__popcnt64(x)); + #elif defined(__has_builtin) + #if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(x); + #else + return popcnt(x); + #endif + #else + return popcnt(x); + #endif + } + } + + inline + std::int32_t all(std::int32_t x) noexcept + { + return pop_count(ballot(x)) == warpSize; + } + + inline + std::int32_t any(std::int32_t x) noexcept + { + return pop_count(ballot(x)) > 0; } template< @@ -151,48 +203,7 @@ namespace hip } } - template>* = nullptr> - inline - std::uint32_t pop_count(T x) noexcept - { - [[maybe_unused]] - constexpr auto popcnt{[](auto&& x) constexpr noexcept { - return std::bitset(x).count(); - }}; - - if constexpr (sizeof(T) == sizeof(std::uint32_t)) { - #if defined(_MSC_VER) - return __popcnt(x); - #elif defined(__has_builtin) - #if __has_builtin(__builtin_popcount) - return __builtin_popcount(x); - #else - return popcnt(x); - #endif - #else - return popcnt(x); - #endif - } - else { - #if defined(_MSC_VER) - return static_cast(__popcnt64(x)); - #elif defined(__has_builtin) - #if __has_builtin(__builtin_popcountll) - return __builtin_popcountll(x); - #else - return popcnt(x); - #endif - #else - return popcnt(x); - #endif - } - } - - template< - typename T, - std::enable_if_t< - (std::is_integral_v || std::is_floating_point_v) && - (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr> + template inline T shuffle(T x, std::int32_t src, std::int32_t w) noexcept { @@ -203,18 +214,19 @@ namespace hip Tile::this_tile().barrier(); const auto sidx{(tidx / w * w) + src}; + const auto r{ + (src < 0 || sidx >= w) ? x : Tile::scratchpad()[sidx]}; + + Tile::this_tile().barrier(); - return (src < 0 || sidx >= w) ? x : Tile::scratchpad()[sidx]; + return r; } - template< - typename T, - std::enable_if_t< - (std::is_integral_v || std::is_floating_point_v) && - (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr> + template inline T shuffle_down(T x, std::int32_t dx, std::int32_t w) noexcept { // TODO: incorrect with large negative offsets, revisit. + // TODO: should probably consider using partial barriers. const auto tidx{id(Fiber::this_fiber()) % warpSize}; Tile::scratchpad()[tidx] = x; @@ -222,18 +234,19 @@ namespace hip Tile::this_tile().barrier(); const auto sidx{(tidx / w * w) + (tidx % w) + dx}; + const auto r{ + (sidx < 0 || sidx >= w) ? x : Tile::scratchpad()[sidx]}; + + Tile::this_tile().barrier(); - return (sidx < 0 || sidx >= w) ? x : Tile::scratchpad()[sidx]; + return r; } - template< - typename T, - std::enable_if_t< - (std::is_integral_v || std::is_floating_point_v) && - (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr> + template inline T shuffle_up(T x, std::int32_t dx, std::int32_t w) noexcept { // TODO: incorrect with large negative offsets, revisit. + // TODO: should probably consider using partial barriers. const auto tidx{id(Fiber::this_fiber()) % warpSize}; Tile::scratchpad()[tidx] = x; @@ -241,18 +254,19 @@ namespace hip Tile::this_tile().barrier(); const auto sidx{(tidx / w * w) + (tidx % w) - dx}; + const auto r{ + (sidx < 0 || sidx >= w) ? x : Tile::scratchpad()[sidx]}; - return (sidx < 0 || sidx >= w) ? x : Tile::scratchpad()[sidx]; + Tile::this_tile().barrier(); + + return r; } - template< - typename T, - std::enable_if_t< - (std::is_integral_v || std::is_floating_point_v) && - (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr> + template inline T shuffle_xor(T x, std::int32_t src, std::int32_t w) noexcept { // TODO: probably incorrect, revisit. + // TODO: should probably consider using partial barriers. const auto tidx{id(Fiber::this_fiber()) % warpSize}; Tile::scratchpad()[tidx] = x; @@ -260,8 +274,11 @@ namespace hip Tile::this_tile().barrier(); const auto sidx{((tidx / w * w) + (tidx % w)) ^ src}; + const auto r{(src < 0) ? x : Tile::scratchpad()[sidx]}; + + Tile::this_tile().barrier(); - return (src < 0) ? x : Tile::scratchpad()[sidx]; + return r; } inline diff --git a/src/include/hip/detail/tile.hpp b/src/include/hip/detail/tile.hpp index 9d0afc9..e52a201 100644 --- a/src/include/hip/detail/tile.hpp +++ b/src/include/hip/detail/tile.hpp @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * Copyright (c) 2020 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. * See 'LICENSE' in the project root for license information. * -------------------------------------------------------------------------- */ #pragma once @@ -16,6 +16,8 @@ #include "../../../../include/hip/hip_constants.h" #include +#include +#include #include #include #include @@ -101,9 +103,11 @@ namespace hip static void for_each_fiber( const F& fn, const std::tuple& args) noexcept; + static + std::bitset& predicate() noexcept; template static - decltype(auto) scratchpad() noexcept; + std::array& scratchpad() noexcept; static const Tile& this_tile() noexcept; @@ -164,15 +168,19 @@ namespace hip Fiber::this_fiber_().set_id_(0); } + inline + std::bitset& Tile::predicate() noexcept + { + return scratchpad, 1>()[0]; + } + template inline - decltype(auto) Tile::scratchpad() noexcept + std::array& Tile::scratchpad() noexcept { // TODO: use named variable for maximum block size. - thread_local static T r[1024 / warpSize][n]; - - const auto widx{id(hip::detail::Fiber::this_fiber()) / warpSize}; + thread_local static std::array r[1024 / warpSize]; - return (r[widx]); + return (r[id(Fiber::this_fiber()) / warpSize]); } inline diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b0d8239..6b5d72b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -31,6 +31,7 @@ add_executable( legacy_tests catch_main.cpp hip_atomics.cpp + hip_device_all_any.cpp hip_device_ballot.cpp hip_device_clock.cpp hip_device_clz.cpp @@ -71,6 +72,7 @@ add_executable( target_link_libraries(legacy_tests PRIVATE tests_common) add_test(NAME "legacy_atomic" COMMAND legacy_tests [device][atomic]) +add_test(NAME "legacy_any_all" COMMAND legacy_tests [device][all][any]) add_test(NAME "legacy_ballot" COMMAND legacy_tests [device][ballot]) add_test(NAME "legacy_clock" COMMAND legacy_tests [device][clock]) add_test(NAME "legacy_clz" COMMAND legacy_tests [device][clz]) diff --git a/tests/hip_device_all_any.cpp b/tests/hip_device_all_any.cpp new file mode 100644 index 0000000..c8910c9 --- /dev/null +++ b/tests/hip_device_all_any.cpp @@ -0,0 +1,101 @@ +/* ----------------------------------------------------------------------------- + * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. + * See 'LICENSE' in the project root for license information. + * -------------------------------------------------------------------------- */ +#include + +#include "../external/catch2/catch.hpp" + +#include +#include + +using namespace std; + +__global__ +void warpvote(int* device_any, int* device_all, int pshift) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + device_any[threadIdx.x >> pshift] = __any(tid - 77); + device_all[threadIdx.x >> pshift] = __all(tid - 77); +} + +TEST_CASE("Unit_AnyAll_CompileTest", "[device][all][any]") +{ + auto w{warpSize}; + auto pshift{0}; + + while (w >>= 1) ++pshift; + + INFO("WarpSize: " << warpSize << " pShift: " << pshift); + + auto anycount{0}; + auto allcount{0}; + auto Num_Threads_per_Block{1024}; + auto Num_Blocks_per_Grid{1}; + auto Num_Warps_per_Grid{ + (Num_Threads_per_Block * Num_Blocks_per_Grid) / warpSize}; + + auto host_any{static_cast(malloc(Num_Warps_per_Grid * sizeof(int)))}; + auto host_all{static_cast(malloc(Num_Warps_per_Grid * sizeof(int)))}; + + int* device_any{}; + int* device_all{}; + + REQUIRE( + hipMalloc(&device_any, Num_Warps_per_Grid * sizeof(int)) == hipSuccess); + REQUIRE( + hipMalloc(&device_all, Num_Warps_per_Grid * sizeof(int)) == hipSuccess); + + fill_n(host_any, Num_Warps_per_Grid, 0); + fill_n(host_all, Num_Warps_per_Grid, 0); + + REQUIRE(hipMemcpy( + device_any, + host_any, + sizeof(int), + hipMemcpyHostToDevice) == hipSuccess); + REQUIRE(hipMemcpy( + device_all, + host_all, + sizeof(int), + hipMemcpyHostToDevice) == hipSuccess); + + hipLaunchKernelGGL( + warpvote, + dim3(Num_Blocks_per_Grid), + dim3(Num_Threads_per_Block), + 0, + nullptr, + device_any, + device_all, + pshift); + + REQUIRE(hipGetLastError() == hipSuccess); + REQUIRE(hipMemcpy( + host_any, + device_any, + Num_Warps_per_Grid * sizeof(int), + hipMemcpyDeviceToHost) == hipSuccess); + REQUIRE(hipMemcpy( + host_all, + device_all, + Num_Warps_per_Grid * sizeof(int), + hipMemcpyDeviceToHost) == hipSuccess); + + for (int i = 0; i < Num_Warps_per_Grid; i++) { + INFO( + "Warp Number: " << i << " __any: " << host_any[i] + << " __all: " << host_all[i]); + + if (host_all[i] != 1) ++allcount; + if (host_any[i] != 1) { + ++anycount; + } + } + + REQUIRE(hipFree(device_any) == hipSuccess); + REQUIRE(hipFree(device_all) == hipSuccess); + + REQUIRE(anycount == 0); + REQUIRE(allcount == 1); +} \ No newline at end of file diff --git a/tests/hip_device_shfl.cpp b/tests/hip_device_shfl.cpp index 04f52b7..cd47c07 100644 --- a/tests/hip_device_shfl.cpp +++ b/tests/hip_device_shfl.cpp @@ -1,7 +1,8 @@ /* ----------------------------------------------------------------------------- - * Copyright (c) 2020 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. * See 'LICENSE' in the project root for license information. * -------------------------------------------------------------------------- */ +#include #include #include "../external/catch2/catch.hpp" @@ -128,4 +129,216 @@ TEMPLATE_TEST_CASE( REQUIRE(hipFree(GPUTransposeMatrix) == hipSuccess); // TODO: add tests for OOB indices. +} + +template +__global__ +void shflDownSum(T* a, int size) +{ + T val = a[threadIdx.x]; + for (int i = size / 2; i > 0; i /= 2) { + val += __shfl_down(val, i, size); + } + a[threadIdx.x] = val; +} + +template +__global__ +void shflUpSum(T* a, int size) +{ + T val = a[threadIdx.x]; + for (int i = size / 2; i > 0; i /= 2) { + val += __shfl_up(val, i, size); + } + a[threadIdx.x] = val; +} + +template +__global__ +void shflXorSum(T* a, int size) +{ + T val = a[threadIdx.x]; + for (int i = size/2; i > 0; i /= 2) { + val += __shfl_xor(val, i, size); + } + a[threadIdx.x] = val; +} + +inline +void getFactor(int* fact) +{ + *fact = 101; +} +inline +void getFactor(unsigned int* fact) +{ + *fact = static_cast(INT32_MAX) + 1; +} +inline +void getFactor(float* fact) +{ + *fact = 2.5; +} +inline +void getFactor(double* fact) +{ + *fact = 2.5; +} +inline +void getFactor(__half* fact) +{ + *fact = 2.5; +} +inline +void getFactor(int64_t* fact) +{ + *fact = 303; +} +inline void getFactor(uint64_t* fact) +{ + *fact = static_cast(__LONG_LONG_MAX__) + 1; +} + +constexpr int sz{32}; + +template +inline +T sum(T* a) +{ + T cpuSum = 0; + T factor; + getFactor(&factor); + for (int i = 0; i < sz; i++) { + a[i] = i + factor; + cpuSum += a[i]; + } + return cpuSum; +} + +template<> +inline +__half sum(__half* a) +{ + __half cpuSum{0}; + __half factor; + getFactor(&factor); + for (int i = 0; i < sz; i++) { + a[i] = i + __half2float(factor); + cpuSum = __half2float(cpuSum) + __half2float(a[i]); + } + return cpuSum; +} + +template +inline +bool compare(T gpuSum, T cpuSum) +{ + if (gpuSum != cpuSum) { + return true; + } + return false; +} + +template<> +inline +bool compare(__half gpuSum, __half cpuSum) +{ + if (__half2float(gpuSum) != __half2float(cpuSum)) { + return true; + } + return false; +} + +template +inline +void runTestShflUp() +{ + constexpr int size{32}; + T a[size]; + T cpuSum = sum(a); + T* d_a{}; + + REQUIRE(hipMalloc(&d_a, sizeof(T) * size) == hipSuccess); + REQUIRE( + hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault) == hipSuccess); + + hipLaunchKernelGGL(shflUpSum, 1, size, 0, 0, d_a, size); + + REQUIRE( + hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault) == hipSuccess); + REQUIRE((compare(a[size - 1], cpuSum)) == 0); + REQUIRE(hipFree(d_a) == hipSuccess); +} + +template +inline +void runTestShflDown() +{ + T a[sz]; + T cpuSum = sum(a); + T* d_a; + + REQUIRE(hipMalloc(&d_a, sizeof(T) * sz) == hipSuccess); + REQUIRE(hipMemcpy(d_a, &a, sizeof(T) * sz, hipMemcpyDefault) == hipSuccess); + + hipLaunchKernelGGL(shflDownSum, 1, sz, 0, 0, d_a, sz); + + REQUIRE(hipMemcpy(&a, d_a, sizeof(T) * sz, hipMemcpyDefault) == hipSuccess); + REQUIRE((compare(a[0], cpuSum)) == 0); + REQUIRE(hipFree(d_a) == hipSuccess); +} + +template +inline +void runTestShflXor() +{ + T a[sz]; + T cpuSum = sum(a); + T* d_a; + + REQUIRE(hipMalloc(&d_a, sizeof(T) * sz) == hipSuccess); + REQUIRE(hipMemcpy(d_a, &a, sizeof(T) * sz, hipMemcpyDefault) == hipSuccess); + + hipLaunchKernelGGL(shflXorSum, 1, sz, 0, 0, d_a, sz); + + REQUIRE(hipMemcpy(&a, d_a, sizeof(T) * sz, hipMemcpyDefault) == hipSuccess); + REQUIRE((compare(a[0], cpuSum)) == 0); + REQUIRE(hipFree(d_a) == hipSuccess); +} + +TEST_CASE("Unit_runTestShfl_up", "[device][shfl]") +{ + SECTION("runTestShflUp for int") { runTestShflUp(); } + SECTION("runTestShflUp for float") { runTestShflUp(); } + SECTION("runTestShflUp for double") { runTestShflUp(); } + SECTION("runTestShflUp for __half") { runTestShflUp<__half>(); } + SECTION("runTestShflUp for int64_t") { runTestShflUp(); } + SECTION("runTestShflUp for unsigned int") { runTestShflUp(); } + SECTION("runTestShflUp for uint64_t") { runTestShflUp(); } +} + +TEST_CASE("Unit_runTestShfl_Down", "[device][shfl]") +{ + SECTION("runTestShflDown for int") { runTestShflDown(); } + SECTION("runTestShflDown for float") { runTestShflDown(); } + SECTION("runTestShflDown for double") { runTestShflDown(); } + SECTION("runTestShflDown for __half") { runTestShflDown<__half>(); } + SECTION("runTestShflDown for int64_t") { runTestShflDown(); } + SECTION("runTestShflDown for unsigned int") { + runTestShflDown(); + } + SECTION("runTestShflDown for uint64_t") { runTestShflDown(); } +} + +TEST_CASE("Unit_runTestShfl_Xor", "[device][shfl]") +{ + SECTION("runTestShflXor for int") { runTestShflXor(); } + SECTION("runTestShflXor for float") { runTestShflXor(); } + SECTION("runTestShflXor for double") { runTestShflXor(); } + SECTION("runTestShflXor for __half") { runTestShflXor<__half>(); } + SECTION("runTestShflXor for int64_t") { runTestShflXor(); } + SECTION("runTestShflXor for unsigned int") { + runTestShflXor(); + } + SECTION("runTestShflXor for uint64_t") { runTestShflXor(); } } \ No newline at end of file