Skip to content

Commit

Permalink
Add __all and __any; fixes #25 ("Missing some warp cross lane functions").
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexVlx committed Sep 1, 2023
1 parent bf8aebc commit 4aa3871
Show file tree
Hide file tree
Showing 6 changed files with 431 additions and 78 deletions.
20 changes: 16 additions & 4 deletions include/hip/hip_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,18 @@
#include <utility>

// BEGIN INTRINSICS
// Vote intrinsic: hands `predicate` to hip::detail::all and returns whatever
// that call yields, unchanged.
inline
std::int32_t __all(std::int32_t predicate) noexcept
{
    const std::int32_t verdict{hip::detail::all(predicate)};

    return verdict;
}

// Vote intrinsic: hands `predicate` to hip::detail::any and returns whatever
// that call yields, unchanged.
inline
std::int32_t __any(std::int32_t predicate) noexcept
{
    const std::int32_t verdict{hip::detail::any(predicate)};

    return verdict;
}

inline
std::uint64_t __ballot(std::int32_t predicate) noexcept
{
Expand Down Expand Up @@ -113,7 +125,7 @@ template<
typename T,
std::enable_if_t<
(std::is_integral_v<T> || std::is_floating_point_v<T>) &&
(sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
sizeof(T) <= 8>* = nullptr>
inline
T __shfl(T var, std::int32_t src_lane, std::int32_t width = warpSize) noexcept
{
Expand All @@ -124,7 +136,7 @@ template<
typename T,
std::enable_if_t<
(std::is_integral_v<T> || std::is_floating_point_v<T>) &&
(sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
sizeof(T) <= 8>* = nullptr>
inline
T __shfl_down(
T var, std::uint32_t delta, std::int32_t width = warpSize) noexcept
Expand All @@ -136,7 +148,7 @@ template<
typename T,
std::enable_if_t<
(std::is_integral_v<T> || std::is_floating_point_v<T>) &&
(sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
sizeof(T) <= 8>* = nullptr>
inline
T __shfl_up(
T var, std::uint32_t delta, std::int32_t width = warpSize) noexcept
Expand All @@ -148,7 +160,7 @@ template<
typename T,
std::enable_if_t<
(std::is_integral_v<T> || std::is_floating_point_v<T>) &&
(sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
sizeof(T) <= 8>* = nullptr>
inline
T __shfl_xor(
T var, std::int32_t src_lane, std::int32_t width = warpSize) noexcept
Expand Down
149 changes: 83 additions & 66 deletions src/include/hip/detail/intrinsics.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* -----------------------------------------------------------------------------
* Copyright (c) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
* Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
* See 'LICENSE' in the project root for license information.
* -------------------------------------------------------------------------- */
#pragma once
Expand Down Expand Up @@ -31,13 +31,65 @@ namespace hip
std::uint64_t ballot(std::int32_t x) noexcept
{
// Records this fiber's predicate `x` as one bit of a warpSize-wide bitset,
// synchronises on the tile barrier, and returns the bitset as a 64-bit mask.
// NOTE(review): this listing appears to be a rendered diff, so the superseded
// and the replacement form of several statements sit back to back below
// (`lds` vs Tile::predicate(), two barrier spellings, two returns) - confirm
// which set is live before relying on any single line.

// Position of the calling fiber within its warp; used as the bit index.
const auto tidx{id(Fiber::this_fiber()) % warpSize};
auto& lds{Tile::scratchpad<std::bitset<warpSize>, 1>()[0]};

// Publish this lane's vote.
lds[tidx] = static_cast<bool>(x);
Tile::predicate()[tidx] = x;

// Barrier between writing the vote and reading the combined mask.
barrier(Tile::this_tile());
Tile::this_tile().barrier();

// Snapshot the mask into a local before the second barrier.
const auto r{Tile::predicate().to_ullong()};

// Second barrier after the read; presumably keeps a later vote from
// overwriting the bitset before every lane has taken its snapshot - confirm.
Tile::this_tile().barrier();

return lds.to_ullong();
return r;
}

// Returns the number of set bits in `x`, considering only the
// sizeof(T) * CHAR_BIT bits that make up a value of type T.
//
// `x` is widened to 64 bits and masked to T's width before counting. Without
// the mask a negative value of a type narrower than 32 bits is sign-extended
// on its way into the 64-bit builtin, so pop_count(std::int8_t(-1)) would
// report 64 with the builtin but 8 with the std::bitset fallback. With the
// mask every code path reports 8.
//
// T: any integral type (enforced by the enable_if constraint).
template<typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
inline
std::uint32_t pop_count(T x) noexcept
{
    constexpr auto width{sizeof(T) * CHAR_BIT};

    // Widen first, then discard anything sign extension put above T's width.
    auto bits{static_cast<std::uint64_t>(x)};
    if constexpr (width < 64) {
        bits &= (std::uint64_t{1} << width) - 1;
    }

    // Portable fallback for toolchains that expose no population-count
    // builtin; unused (hence the attribute) when a builtin is available.
    [[maybe_unused]]
    const auto popcnt{[](std::uint64_t v) noexcept {
        return static_cast<std::uint32_t>(std::bitset<64>(v).count());
    }};

    if constexpr (sizeof(T) <= sizeof(std::uint32_t)) {
        // Everything that fits in 32 bits uses the 32-bit primitive.
        const auto lo{static_cast<std::uint32_t>(bits)};
#if defined(_MSC_VER)
        return __popcnt(lo);
#elif defined(__has_builtin)
    #if __has_builtin(__builtin_popcount)
        return static_cast<std::uint32_t>(__builtin_popcount(lo));
    #else
        return popcnt(lo);
    #endif
#else
        return popcnt(lo);
#endif
    }
    else {
#if defined(_MSC_VER)
        return static_cast<std::uint32_t>(__popcnt64(bits));
#elif defined(__has_builtin)
    #if __has_builtin(__builtin_popcountll)
        return static_cast<std::uint32_t>(__builtin_popcountll(bits));
    #else
        return popcnt(bits);
    #endif
#else
        return popcnt(bits);
#endif
    }
}

// Yields 1 when the number of set bits in ballot(x) equals warpSize, and 0
// otherwise.
// NOTE(review): the comparison is against the full warpSize, so this
// presumably assumes every lane of the warp takes part in the ballot -
// confirm the intended result for partially populated warps.
inline
std::int32_t all(std::int32_t x) noexcept
{
    const auto votes{pop_count(ballot(x))};

    return votes == warpSize;
}

// Yields 1 when ballot(x) has at least one bit set, and 0 otherwise.
inline
std::int32_t any(std::int32_t x) noexcept
{
    const auto votes{pop_count(ballot(x))};

    return votes != 0u;
}

template<
Expand Down Expand Up @@ -151,48 +203,7 @@ namespace hip
}
}

// Returns the number of set bits in `x`, considering only the
// sizeof(T) * CHAR_BIT bits that make up a value of type T.
//
// `x` is widened to 64 bits and masked to T's width before counting. Without
// the mask a negative value of a type narrower than 32 bits is sign-extended
// on its way into the 64-bit builtin, so pop_count(std::int8_t(-1)) would
// report 64 with the builtin but 8 with the std::bitset fallback. With the
// mask every code path reports 8.
//
// T: any integral type (enforced by the enable_if constraint).
template<typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
inline
std::uint32_t pop_count(T x) noexcept
{
    constexpr auto width{sizeof(T) * CHAR_BIT};

    // Widen first, then discard anything sign extension put above T's width.
    auto bits{static_cast<std::uint64_t>(x)};
    if constexpr (width < 64) {
        bits &= (std::uint64_t{1} << width) - 1;
    }

    // Portable fallback for toolchains that expose no population-count
    // builtin; unused (hence the attribute) when a builtin is available.
    [[maybe_unused]]
    const auto popcnt{[](std::uint64_t v) noexcept {
        return static_cast<std::uint32_t>(std::bitset<64>(v).count());
    }};

    if constexpr (sizeof(T) <= sizeof(std::uint32_t)) {
        // Everything that fits in 32 bits uses the 32-bit primitive.
        const auto lo{static_cast<std::uint32_t>(bits)};
#if defined(_MSC_VER)
        return __popcnt(lo);
#elif defined(__has_builtin)
    #if __has_builtin(__builtin_popcount)
        return static_cast<std::uint32_t>(__builtin_popcount(lo));
    #else
        return popcnt(lo);
    #endif
#else
        return popcnt(lo);
#endif
    }
    else {
#if defined(_MSC_VER)
        return static_cast<std::uint32_t>(__popcnt64(bits));
#elif defined(__has_builtin)
    #if __has_builtin(__builtin_popcountll)
        return static_cast<std::uint32_t>(__builtin_popcountll(bits));
    #else
        return popcnt(bits);
    #endif
#else
        return popcnt(bits);
#endif
    }
}

template<
typename T,
std::enable_if_t<
(std::is_integral_v<T> || std::is_floating_point_v<T>) &&
(sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
template<typename T>
inline
T shuffle(T x, std::int32_t src, std::int32_t w) noexcept
{
Expand All @@ -203,65 +214,71 @@ namespace hip
Tile::this_tile().barrier();

const auto sidx{(tidx / w * w) + src};
const auto r{
(src < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx]};

Tile::this_tile().barrier();

return (src < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx];
return r;
}

// Exchanges `x` between fibers of a warp: each fiber stores its value in the
// per-warp scratchpad and then reads the slot `dx` positions above its own
// within a segment of width `w`, falling back to its own `x` when the source
// index fails the range check.
// NOTE(review): this listing appears to be a rendered diff, so the superseded
// and the replacement form of some lines sit back to back (two template
// headers, two returns) - confirm which set is live.
template<
typename T,
std::enable_if_t<
(std::is_integral_v<T> || std::is_floating_point_v<T>) &&
(sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
template<typename T>
inline
T shuffle_down(T x, std::int32_t dx, std::int32_t w) noexcept
{ // TODO: incorrect with large negative offsets, revisit.
// TODO: should probably consider using partial barriers.
// Position of the calling fiber within its warp.
const auto tidx{id(Fiber::this_fiber()) % warpSize};

// Publish this fiber's value so other fibers can read it after the barrier.
Tile::scratchpad<T>()[tidx] = x;

Tile::this_tile().barrier();

// Source slot: segment base (tidx / w * w) + offset in segment + dx.
// NOTE(review): sidx includes the segment base but is compared against `w`
// below, so for w < warpSize fibers outside the first segment would always
// take the `x` branch - confirm whether the check should use the in-segment
// offset instead.
const auto sidx{(tidx / w * w) + (tidx % w) + dx};
const auto r{
(sidx < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx]};

// Barrier after the read; presumably prevents a following shuffle from
// overwriting the scratchpad while other fibers still read it - confirm.
Tile::this_tile().barrier();

return (sidx < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx];
return r;
}

// Exchanges `x` between fibers of a warp: each fiber stores its value in the
// per-warp scratchpad and then reads the slot `dx` positions below its own
// within a segment of width `w`, falling back to its own `x` when the source
// index fails the range check.
// NOTE(review): this listing appears to be a rendered diff, so the superseded
// and the replacement form of some lines sit back to back (two template
// headers, two returns) - confirm which set is live.
template<
typename T,
std::enable_if_t<
(std::is_integral_v<T> || std::is_floating_point_v<T>) &&
(sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
template<typename T>
inline
T shuffle_up(T x, std::int32_t dx, std::int32_t w) noexcept
{ // TODO: incorrect with large negative offsets, revisit.
// TODO: should probably consider using partial barriers.
// Position of the calling fiber within its warp.
const auto tidx{id(Fiber::this_fiber()) % warpSize};

// Publish this fiber's value so other fibers can read it after the barrier.
Tile::scratchpad<T>()[tidx] = x;

Tile::this_tile().barrier();

// Source slot: segment base (tidx / w * w) + offset in segment - dx.
// NOTE(review): sidx includes the segment base but is compared against `w`
// below, so for w < warpSize fibers outside the first segment would always
// take the `x` branch - confirm whether the check should use the in-segment
// offset instead.
const auto sidx{(tidx / w * w) + (tidx % w) - dx};
const auto r{
(sidx < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx]};

return (sidx < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx];
// Barrier after the read; presumably prevents a following shuffle from
// overwriting the scratchpad while other fibers still read it - confirm.
Tile::this_tile().barrier();

return r;
}

// Exchanges `x` between fibers of a warp: each fiber stores its value in the
// per-warp scratchpad and then reads the slot whose index is its own index
// XOR `src`; a negative `src` returns the fiber's own `x`.
// NOTE(review): this listing appears to be a rendered diff, so the superseded
// and the replacement form of some lines sit back to back (two template
// headers, two returns) - confirm which set is live.
template<
typename T,
std::enable_if_t<
(std::is_integral_v<T> || std::is_floating_point_v<T>) &&
(sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
template<typename T>
inline
T shuffle_xor(T x, std::int32_t src, std::int32_t w) noexcept
{ // TODO: probably incorrect, revisit.
// TODO: should probably consider using partial barriers.
// Position of the calling fiber within its warp.
const auto tidx{id(Fiber::this_fiber()) % warpSize};

// Publish this fiber's value so other fibers can read it after the barrier.
Tile::scratchpad<T>()[tidx] = x;

Tile::this_tile().barrier();

// (tidx / w * w) + (tidx % w) recombines the segment base and the offset in
// the segment; the result is then XORed with `src`.
// NOTE(review): only `src < 0` is rejected; sidx has no upper-bound check
// before it indexes the scratchpad - confirm that callers keep `src` small
// enough.
const auto sidx{((tidx / w * w) + (tidx % w)) ^ src};
const auto r{(src < 0) ? x : Tile::scratchpad<T>()[sidx]};

// Barrier after the read; presumably prevents a following shuffle from
// overwriting the scratchpad while other fibers still read it - confirm.
Tile::this_tile().barrier();

return (src < 0) ? x : Tile::scratchpad<T>()[sidx];
return r;
}

inline
Expand Down
22 changes: 15 additions & 7 deletions src/include/hip/detail/tile.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* -----------------------------------------------------------------------------
* Copyright (c) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
* Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
* See 'LICENSE' in the project root for license information.
* -------------------------------------------------------------------------- */
#pragma once
Expand All @@ -16,6 +16,8 @@
#include "../../../../include/hip/hip_constants.h"

#include <algorithm>
#include <array>
#include <bitset>
#include <execution>
#include <functional>
#include <cstdint>
Expand Down Expand Up @@ -101,9 +103,11 @@ namespace hip
static
void for_each_fiber(
const F& fn, const std::tuple<Args...>& args) noexcept;
static
std::bitset<warpSize>& predicate() noexcept;
template<typename T, std::size_t n = warpSize>
static
decltype(auto) scratchpad() noexcept;
std::array<T, n>& scratchpad() noexcept;
static
const Tile& this_tile() noexcept;

Expand Down Expand Up @@ -164,15 +168,19 @@ namespace hip
Fiber::this_fiber_().set_id_(0);
}

inline
std::bitset<warpSize>& Tile::predicate() noexcept
{
return scratchpad<std::bitset<warpSize>, 1>()[0];
}

// Returns a reference to the calling fiber's per-warp scratch storage: `n`
// elements of type T, selected by (fiber id / warpSize) out of
// 1024 / warpSize slots held in thread_local static storage.
// NOTE(review): this listing appears to be a rendered diff, so the superseded
// and the replacement form of some lines sit back to back (two return types,
// two storage declarations, two returns) - confirm which set is live.
template<typename T, std::size_t n>
inline
decltype(auto) Tile::scratchpad() noexcept
std::array<T, n>& Tile::scratchpad() noexcept
{ // TODO: use named variable for maximum block size.
// One row per warp slot; 1024 is the hard-coded size the TODO above refers to.
thread_local static T r[1024 / warpSize][n];

// Index of the calling fiber's warp.
const auto widx{id(hip::detail::Fiber::this_fiber()) / warpSize};
thread_local static std::array<T, n> r[1024 / warpSize];

// The parentheses make a decltype(auto) return type deduce a reference.
return (r[widx]);
return (r[id(Fiber::this_fiber()) / warpSize]);
}

inline
Expand Down
2 changes: 2 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ add_executable(
legacy_tests
catch_main.cpp
hip_atomics.cpp
hip_device_all_any.cpp
hip_device_ballot.cpp
hip_device_clock.cpp
hip_device_clz.cpp
Expand Down Expand Up @@ -71,6 +72,7 @@ add_executable(
target_link_libraries(legacy_tests PRIVATE tests_common)

add_test(NAME "legacy_atomic" COMMAND legacy_tests [device][atomic])
add_test(NAME "legacy_any_all" COMMAND legacy_tests [device][all][any])
add_test(NAME "legacy_ballot" COMMAND legacy_tests [device][ballot])
add_test(NAME "legacy_clock" COMMAND legacy_tests [device][clock])
add_test(NAME "legacy_clz" COMMAND legacy_tests [device][clz])
Expand Down
Loading

0 comments on commit 4aa3871

Please sign in to comment.