diff --git a/include/hip/hip_api.h b/include/hip/hip_api.h
index a03717c..48e3fdb 100644
--- a/include/hip/hip_api.h
+++ b/include/hip/hip_api.h
@@ -25,6 +25,18 @@
 #include <utility>
 
 // BEGIN INTRINSICS
+inline
+std::int32_t __all(std::int32_t predicate) noexcept
+{
+    return hip::detail::all(predicate);
+}
+
+inline
+std::int32_t __any(std::int32_t predicate) noexcept
+{
+    return hip::detail::any(predicate);
+}
+
 inline
 std::uint64_t __ballot(std::int32_t predicate) noexcept
 {
@@ -113,7 +125,7 @@ template<
     typename T,
     std::enable_if_t<
         (std::is_integral_v<T> || std::is_floating_point_v<T>) &&
-        (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
+        sizeof(T) <= 8>* = nullptr>
 inline
 T __shfl(T var, std::int32_t src_lane, std::int32_t width = warpSize) noexcept
 {
@@ -124,7 +136,7 @@ template<
     typename T,
     std::enable_if_t<
         (std::is_integral_v<T> || std::is_floating_point_v<T>) &&
-        (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
+        sizeof(T) <= 8>* = nullptr>
 inline
 T __shfl_down(
     T var, std::uint32_t delta, std::int32_t width = warpSize) noexcept
@@ -136,7 +148,7 @@ template<
     typename T,
     std::enable_if_t<
         (std::is_integral_v<T> || std::is_floating_point_v<T>) &&
-        (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
+        sizeof(T) <= 8>* = nullptr>
 inline
 T __shfl_up(
     T var, std::uint32_t delta, std::int32_t width = warpSize) noexcept
@@ -148,7 +160,7 @@ template<
     typename T,
     std::enable_if_t<
         (std::is_integral_v<T> || std::is_floating_point_v<T>) &&
-        (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
+        sizeof(T) <= 8>* = nullptr>
 inline
 T __shfl_xor(
     T var, std::int32_t src_lane, std::int32_t width = warpSize) noexcept
diff --git a/src/include/hip/detail/intrinsics.hpp b/src/include/hip/detail/intrinsics.hpp
index f8a4c40..be41516 100644
--- a/src/include/hip/detail/intrinsics.hpp
+++ b/src/include/hip/detail/intrinsics.hpp
@@ -1,5 +1,5 @@
 /* -----------------------------------------------------------------------------
- * Copyright (c) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
  * See 'LICENSE' in the project root for license information.
  * -------------------------------------------------------------------------- */
 #pragma once
@@ -31,13 +31,65 @@ namespace hip
         std::uint64_t ballot(std::int32_t x) noexcept
         {
             const auto tidx{id(Fiber::this_fiber()) % warpSize};
-            auto& lds{Tile::scratchpad<std::bitset<warpSize>, 1>()[0]};
 
-            lds[tidx] = static_cast<bool>(x);
+            Tile::predicate()[tidx] = x;
 
-            barrier(Tile::this_tile());
+            Tile::this_tile().barrier();
+
+            const auto r{Tile::predicate().to_ullong()};
+
+            Tile::this_tile().barrier();
 
-            return lds.to_ullong();
+            return r;
+        }
+
+        template<typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+        inline
+        std::uint32_t pop_count(T x) noexcept
+        {
+            [[maybe_unused]]
+            constexpr auto popcnt{[](auto&& x) constexpr noexcept {
+                return std::bitset<sizeof(T) * CHAR_BIT>(x).count();
+            }};
+
+            if constexpr (sizeof(T) == sizeof(std::uint32_t)) {
+                #if defined(_MSC_VER)
+                    return __popcnt(x);
+                #elif defined(__has_builtin)
+                    #if __has_builtin(__builtin_popcount)
+                        return __builtin_popcount(x);
+                    #else
+                        return popcnt(x);
+                    #endif
+                #else
+                    return popcnt(x);
+                #endif
+            }
+            else {
+                #if defined(_MSC_VER)
+                    return static_cast<std::uint32_t>(__popcnt64(x));
+                #elif defined(__has_builtin)
+                    #if __has_builtin(__builtin_popcountll)
+                        return __builtin_popcountll(x);
+                    #else
+                        return popcnt(x);
+                    #endif
+                #else
+                    return popcnt(x);
+                #endif
+            }
+        }
+
+        inline
+        std::int32_t all(std::int32_t x) noexcept
+        {
+            return pop_count(ballot(x)) == warpSize;
+        }
+
+        inline
+        std::int32_t any(std::int32_t x) noexcept
+        {
+            return pop_count(ballot(x)) > 0;
         }
 
         template<
@@ -151,48 +203,7 @@ namespace hip
             }
         }
 
-        template<typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
-        inline
-        std::uint32_t pop_count(T x) noexcept
-        {
-            [[maybe_unused]]
-            constexpr auto popcnt{[](auto&& x) constexpr noexcept {
-                return std::bitset<sizeof(T) * CHAR_BIT>(x).count();
-            }};
-
-            if constexpr (sizeof(T) == sizeof(std::uint32_t)) {
-                #if defined(_MSC_VER)
-                    return __popcnt(x);
-                #elif defined(__has_builtin)
-                    #if __has_builtin(__builtin_popcount)
-                        return __builtin_popcount(x);
-                    #else
-                        return popcnt(x);
-                    #endif
-                #else
-                    return popcnt(x);
-                #endif
-            }
-            else {
-                #if defined(_MSC_VER)
-                    return static_cast<std::uint32_t>(__popcnt64(x));
-                #elif defined(__has_builtin)
-                    #if __has_builtin(__builtin_popcountll)
-                        return __builtin_popcountll(x);
-                    #else
-                        return popcnt(x);
-                    #endif
-                #else
-                    return popcnt(x);
-                #endif
-            }
-        }
-
-        template<
-            typename T,
-            std::enable_if_t<
-                (std::is_integral_v<T> || std::is_floating_point_v<T>) &&
-                (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
+        template<typename T>
         inline
         T shuffle(T x, std::int32_t src, std::int32_t w) noexcept
         {
@@ -203,18 +214,19 @@ namespace hip
             Tile::this_tile().barrier();
 
             const auto sidx{(tidx / w * w) + src};
+            const auto r{
+                (src < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx]};
+
+            Tile::this_tile().barrier();
 
-            return (src < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx];
+            return r;
         }
 
-        template<
-            typename T,
-            std::enable_if_t<
-                (std::is_integral_v<T> || std::is_floating_point_v<T>) &&
-                (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
+        template<typename T>
         inline
         T shuffle_down(T x, std::int32_t dx, std::int32_t w) noexcept
         {   // TODO: incorrect with large negative offsets, revisit.
+            // TODO: should probably consider using partial barriers.
             const auto tidx{id(Fiber::this_fiber()) % warpSize};
 
             Tile::scratchpad<T>()[tidx] = x;
@@ -222,18 +234,19 @@ namespace hip
             Tile::this_tile().barrier();
 
             const auto sidx{(tidx / w * w) + (tidx % w) + dx};
+            const auto r{
+                (sidx < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx]};
+
+            Tile::this_tile().barrier();
 
-            return (sidx < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx];
+            return r;
         }
 
-        template<
-            typename T,
-            std::enable_if_t<
-                (std::is_integral_v<T> || std::is_floating_point_v<T>) &&
-                (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
+        template<typename T>
         inline
         T shuffle_up(T x, std::int32_t dx, std::int32_t w) noexcept
         {   // TODO: incorrect with large negative offsets, revisit.
+            // TODO: should probably consider using partial barriers.
             const auto tidx{id(Fiber::this_fiber()) % warpSize};
 
             Tile::scratchpad<T>()[tidx] = x;
@@ -241,18 +254,19 @@ namespace hip
             Tile::this_tile().barrier();
 
             const auto sidx{(tidx / w * w) + (tidx % w) - dx};
+            const auto r{
+                (sidx < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx]};
 
-            return (sidx < 0 || sidx >= w) ? x : Tile::scratchpad<T>()[sidx];
+            Tile::this_tile().barrier();
+
+            return r;
         }
 
-        template<
-            typename T,
-            std::enable_if_t<
-                (std::is_integral_v<T> || std::is_floating_point_v<T>) &&
-                (sizeof(T) >= 4 && sizeof(T) <= 8)>* = nullptr>
+        template<typename T>
         inline
         T shuffle_xor(T x, std::int32_t src, std::int32_t w) noexcept
         {   // TODO: probably incorrect, revisit.
+            // TODO: should probably consider using partial barriers.
             const auto tidx{id(Fiber::this_fiber()) % warpSize};
 
             Tile::scratchpad<T>()[tidx] = x;
@@ -260,8 +274,11 @@ namespace hip
             Tile::this_tile().barrier();
 
             const auto sidx{((tidx / w * w) + (tidx % w)) ^ src};
+            const auto r{(src < 0) ? x : Tile::scratchpad<T>()[sidx]};
+
+            Tile::this_tile().barrier();
 
-            return (src < 0) ? x : Tile::scratchpad<T>()[sidx];
+            return r;
         }
 
         inline
diff --git a/src/include/hip/detail/tile.hpp b/src/include/hip/detail/tile.hpp
index 9d0afc9..e52a201 100644
--- a/src/include/hip/detail/tile.hpp
+++ b/src/include/hip/detail/tile.hpp
@@ -1,5 +1,5 @@
 /* -----------------------------------------------------------------------------
- * Copyright (c) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
  * See 'LICENSE' in the project root for license information.
  * -------------------------------------------------------------------------- */
 #pragma once
@@ -16,6 +16,8 @@
 #include "../../../../include/hip/hip_constants.h"
 
 #include <algorithm>
+#include <array>
+#include <bitset>
 #include <execution>
 #include <functional>
 #include <cstdint>
@@ -101,9 +103,11 @@ namespace hip
             static
             void for_each_fiber(
                 const F& fn, const std::tuple<Args...>& args) noexcept;
+            static
+            std::bitset<warpSize>& predicate() noexcept;
             template<typename T, std::size_t n = warpSize>
             static
-            decltype(auto) scratchpad() noexcept;
+            std::array<T, n>& scratchpad() noexcept;
             static
             const Tile& this_tile() noexcept;
 
@@ -164,15 +168,19 @@ namespace hip
             Fiber::this_fiber_().set_id_(0);
         }
 
+        inline
+        std::bitset<warpSize>& Tile::predicate() noexcept
+        {
+            return scratchpad<std::bitset<warpSize>, 1>()[0];
+        }
+
         template<typename T, std::size_t n>
         inline
-        decltype(auto) Tile::scratchpad() noexcept
+        std::array<T, n>& Tile::scratchpad() noexcept
         {   // TODO: use named variable for maximum block size.
-            thread_local static T r[1024 / warpSize][n];
-
-            const auto widx{id(hip::detail::Fiber::this_fiber()) / warpSize};
+            thread_local static std::array<T, n> r[1024 / warpSize];
 
-            return (r[widx]);
+            return (r[id(Fiber::this_fiber()) / warpSize]);
         }
 
         inline
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b0d8239..6b5d72b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -31,6 +31,7 @@ add_executable(
     legacy_tests
         catch_main.cpp
         hip_atomics.cpp
+        hip_device_all_any.cpp
         hip_device_ballot.cpp
         hip_device_clock.cpp
         hip_device_clz.cpp
@@ -71,6 +72,7 @@ add_executable(
 target_link_libraries(legacy_tests PRIVATE tests_common)
 
 add_test(NAME "legacy_atomic" COMMAND legacy_tests [device][atomic])
+add_test(NAME "legacy_any_all" COMMAND legacy_tests [device][all][any])
 add_test(NAME "legacy_ballot" COMMAND legacy_tests [device][ballot])
 add_test(NAME "legacy_clock" COMMAND legacy_tests [device][clock])
 add_test(NAME "legacy_clz" COMMAND legacy_tests [device][clz])
diff --git a/tests/hip_device_all_any.cpp b/tests/hip_device_all_any.cpp
new file mode 100644
index 0000000..c8910c9
--- /dev/null
+++ b/tests/hip_device_all_any.cpp
@@ -0,0 +1,101 @@
+/* -----------------------------------------------------------------------------
+ * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
+ * See 'LICENSE' in the project root for license information.
+ * -------------------------------------------------------------------------- */
+#include <hip/hip_runtime.h>
+
+#include "../external/catch2/catch.hpp"
+
+#include <algorithm>
+#include <cstdlib>
+
+using namespace std;
+
+__global__
+void warpvote(int* device_any, int* device_all, int pshift)
+{
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    device_any[threadIdx.x >> pshift] = __any(tid - 77);
+    device_all[threadIdx.x >> pshift] = __all(tid - 77);
+}
+
+TEST_CASE("Unit_AnyAll_CompileTest", "[device][all][any]")
+{
+    auto w{warpSize};
+    auto pshift{0};
+
+    while (w >>= 1) ++pshift;
+
+    INFO("WarpSize: " << warpSize << " pShift: " << pshift);
+
+    auto anycount{0};
+    auto allcount{0};
+    auto Num_Threads_per_Block{1024};
+    auto Num_Blocks_per_Grid{1};
+    auto Num_Warps_per_Grid{
+        (Num_Threads_per_Block * Num_Blocks_per_Grid) / warpSize};
+
+    auto host_any{static_cast<int*>(malloc(Num_Warps_per_Grid * sizeof(int)))};
+    auto host_all{static_cast<int*>(malloc(Num_Warps_per_Grid * sizeof(int)))};
+
+    int* device_any{};
+    int* device_all{};
+
+    REQUIRE(
+        hipMalloc(&device_any, Num_Warps_per_Grid * sizeof(int)) == hipSuccess);
+    REQUIRE(
+        hipMalloc(&device_all, Num_Warps_per_Grid * sizeof(int)) == hipSuccess);
+
+    fill_n(host_any, Num_Warps_per_Grid, 0);
+    fill_n(host_all, Num_Warps_per_Grid, 0);
+
+    REQUIRE(hipMemcpy(
+        device_any,
+        host_any,
+        sizeof(int),
+        hipMemcpyHostToDevice) == hipSuccess);
+    REQUIRE(hipMemcpy(
+        device_all,
+        host_all,
+        sizeof(int),
+        hipMemcpyHostToDevice) == hipSuccess);
+
+    hipLaunchKernelGGL(
+        warpvote,
+        dim3(Num_Blocks_per_Grid),
+        dim3(Num_Threads_per_Block),
+        0,
+        nullptr,
+        device_any,
+        device_all,
+        pshift);
+
+    REQUIRE(hipGetLastError() == hipSuccess);
+    REQUIRE(hipMemcpy(
+        host_any,
+        device_any,
+        Num_Warps_per_Grid * sizeof(int),
+        hipMemcpyDeviceToHost) == hipSuccess);
+    REQUIRE(hipMemcpy(
+        host_all,
+        device_all,
+        Num_Warps_per_Grid * sizeof(int),
+        hipMemcpyDeviceToHost) == hipSuccess);
+
+    for (int i = 0; i < Num_Warps_per_Grid; i++) {
+        INFO(
+            "Warp Number: " << i << " __any: " << host_any[i]
+                << " __all: " << host_all[i]);
+
+        if (host_all[i] != 1) ++allcount;
+        if (host_any[i] != 1) {
+            ++anycount;
+        }
+    }
+
+    REQUIRE(hipFree(device_any) == hipSuccess);
+    REQUIRE(hipFree(device_all) == hipSuccess);
+
+    REQUIRE(anycount == 0);
+    REQUIRE(allcount == 1);
+}
\ No newline at end of file
diff --git a/tests/hip_device_shfl.cpp b/tests/hip_device_shfl.cpp
index 04f52b7..cd47c07 100644
--- a/tests/hip_device_shfl.cpp
+++ b/tests/hip_device_shfl.cpp
@@ -1,7 +1,8 @@
 /* -----------------------------------------------------------------------------
- * Copyright (c) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
  * See 'LICENSE' in the project root for license information.
  * -------------------------------------------------------------------------- */
+#include <hip/hip_fp16.h>
 #include <hip/hip_runtime.h>
 
 #include "../external/catch2/catch.hpp"
@@ -128,4 +129,216 @@ TEMPLATE_TEST_CASE(
     REQUIRE(hipFree(GPUTransposeMatrix) == hipSuccess);
 
     // TODO: add tests for OOB indices.
+}
+
+template<typename T>
+__global__
+void shflDownSum(T* a, int size)
+{
+    T val = a[threadIdx.x];
+    for (int i = size / 2; i > 0; i /= 2) {
+        val += __shfl_down(val, i, size);
+    }
+    a[threadIdx.x] = val;
+}
+
+template<typename T>
+__global__
+void shflUpSum(T* a, int size)
+{
+  T val = a[threadIdx.x];
+  for (int i = size / 2; i > 0; i /= 2) {
+    val += __shfl_up(val, i, size);
+  }
+  a[threadIdx.x] = val;
+}
+
+template<typename T>
+__global__
+void shflXorSum(T* a, int size)
+{
+  T val = a[threadIdx.x];
+  for (int i = size/2; i > 0; i /= 2) {
+    val += __shfl_xor(val, i, size);
+  }
+  a[threadIdx.x] = val;
+}
+
+inline
+void getFactor(int* fact)
+{
+    *fact = 101;
+}
+inline
+void getFactor(unsigned int* fact)
+{
+    *fact = static_cast<unsigned int>(INT32_MAX) + 1;
+}
+inline
+void getFactor(float* fact)
+{
+    *fact = 2.5;
+}
+inline
+void getFactor(double* fact)
+{
+    *fact = 2.5;
+}
+inline
+void getFactor(__half* fact)
+{
+    *fact = 2.5;
+}
+inline
+void getFactor(int64_t* fact)
+{
+    *fact = 303;
+}
+inline void getFactor(uint64_t* fact)
+{
+    *fact = static_cast<uint64_t>(__LONG_LONG_MAX__) + 1;
+}
+
+constexpr int sz{32};
+
+template<typename T>
+inline
+T sum(T* a)
+{
+    T cpuSum = 0;
+    T factor;
+    getFactor(&factor);
+    for (int i = 0; i < sz; i++) {
+        a[i] = i + factor;
+        cpuSum += a[i];
+    }
+    return cpuSum;
+}
+
+template<>
+inline
+__half sum(__half* a)
+{
+    __half cpuSum{0};
+    __half factor;
+    getFactor(&factor);
+    for (int i = 0; i < sz; i++) {
+        a[i] = i + __half2float(factor);
+        cpuSum = __half2float(cpuSum) + __half2float(a[i]);
+    }
+    return cpuSum;
+}
+
+template<typename T>
+inline
+bool compare(T gpuSum, T cpuSum)
+{
+    if (gpuSum != cpuSum) {
+        return true;
+    }
+    return false;
+}
+
+template<>
+inline
+bool compare(__half gpuSum, __half cpuSum)
+{
+    if (__half2float(gpuSum) != __half2float(cpuSum)) {
+        return true;
+    }
+    return false;
+}
+
+template<typename T>
+inline
+void runTestShflUp()
+{
+    constexpr int size{32};
+    T a[size];
+    T cpuSum = sum(a);
+    T* d_a{};
+
+    REQUIRE(hipMalloc(&d_a, sizeof(T) * size) == hipSuccess);
+    REQUIRE(
+        hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault) == hipSuccess);
+
+    hipLaunchKernelGGL(shflUpSum<T>, 1, size, 0, 0, d_a, size);
+
+    REQUIRE(
+        hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault) == hipSuccess);
+    REQUIRE((compare(a[size - 1], cpuSum)) == 0);
+    REQUIRE(hipFree(d_a) == hipSuccess);
+}
+
+template<typename T>
+inline
+void runTestShflDown()
+{
+    T a[sz];
+    T cpuSum = sum(a);
+    T* d_a;
+
+    REQUIRE(hipMalloc(&d_a, sizeof(T) * sz) == hipSuccess);
+    REQUIRE(hipMemcpy(d_a, &a, sizeof(T) * sz, hipMemcpyDefault) == hipSuccess);
+
+    hipLaunchKernelGGL(shflDownSum<T>, 1, sz, 0, 0, d_a, sz);
+
+    REQUIRE(hipMemcpy(&a, d_a, sizeof(T) * sz, hipMemcpyDefault) == hipSuccess);
+    REQUIRE((compare(a[0], cpuSum)) == 0);
+    REQUIRE(hipFree(d_a) == hipSuccess);
+}
+
+template<typename T>
+inline
+void runTestShflXor()
+{
+    T a[sz];
+    T cpuSum = sum(a);
+    T* d_a;
+
+    REQUIRE(hipMalloc(&d_a, sizeof(T) * sz) == hipSuccess);
+    REQUIRE(hipMemcpy(d_a, &a, sizeof(T) * sz, hipMemcpyDefault) == hipSuccess);
+
+    hipLaunchKernelGGL(shflXorSum<T>, 1, sz, 0, 0, d_a, sz);
+
+    REQUIRE(hipMemcpy(&a, d_a, sizeof(T) * sz, hipMemcpyDefault) == hipSuccess);
+    REQUIRE((compare(a[0], cpuSum)) == 0);
+    REQUIRE(hipFree(d_a) == hipSuccess);
+}
+
+TEST_CASE("Unit_runTestShfl_up", "[device][shfl]")
+{
+    SECTION("runTestShflUp for int") { runTestShflUp<int>(); }
+    SECTION("runTestShflUp for float") { runTestShflUp<float>(); }
+    SECTION("runTestShflUp for double") { runTestShflUp<double>(); }
+    SECTION("runTestShflUp for __half") { runTestShflUp<__half>(); }
+    SECTION("runTestShflUp for int64_t") { runTestShflUp<int64_t>(); }
+    SECTION("runTestShflUp for unsigned int") { runTestShflUp<unsigned int>(); }
+    SECTION("runTestShflUp for uint64_t") { runTestShflUp<uint64_t>(); }
+}
+
+TEST_CASE("Unit_runTestShfl_Down", "[device][shfl]")
+{
+    SECTION("runTestShflDown for int") { runTestShflDown<int>(); }
+    SECTION("runTestShflDown for float") { runTestShflDown<float>(); }
+    SECTION("runTestShflDown for double") { runTestShflDown<double>(); }
+    SECTION("runTestShflDown for __half") { runTestShflDown<__half>(); }
+    SECTION("runTestShflDown for int64_t") { runTestShflDown<int64_t>(); }
+    SECTION("runTestShflDown for unsigned int") {
+        runTestShflDown<unsigned int>();
+    }
+    SECTION("runTestShflDown for uint64_t") { runTestShflDown<uint64_t>(); }
+}
+
+TEST_CASE("Unit_runTestShfl_Xor", "[device][shfl]")
+{
+    SECTION("runTestShflXor for int") { runTestShflXor<int>(); }
+    SECTION("runTestShflXor for float") { runTestShflXor<float>(); }
+    SECTION("runTestShflXor for double") { runTestShflXor<double>(); }
+    SECTION("runTestShflXor for __half") { runTestShflXor<__half>(); }
+    SECTION("runTestShflXor for int64_t") { runTestShflXor<int64_t>(); }
+    SECTION("runTestShflXor for unsigned int") {
+        runTestShflXor<unsigned int>();
+    }
+    SECTION("runTestShflXor for uint64_t") { runTestShflXor<uint64_t>(); }
 }
\ No newline at end of file