Skip to content

Commit

Permalink
Use HYPRE_WARP_SIZE instead of hardcoded numbers
Browse files Browse the repository at this point in the history
This hits the loop and if statements.
  • Loading branch information
pbauman committed Jul 23, 2020
1 parent b075d64 commit 950a747
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 29 deletions.
28 changes: 14 additions & 14 deletions src/utilities/_hypre_utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,28 +373,28 @@ hypre_double atomicAdd(hypre_double* address, hypre_double val)

template <typename T>
static __device__ __forceinline__
T __shfl_sync(unsigned mask, T val, hypre_int src_line, hypre_int width=32)
T __shfl_sync(unsigned mask, T val, hypre_int src_line, hypre_int width=HYPRE_WARP_SIZE)
{
return __shfl(val, src_line, width);
}

template <typename T>
static __device__ __forceinline__
T __shfl_down_sync(unsigned mask, T val, unsigned delta, hypre_int width=32)
T __shfl_down_sync(unsigned mask, T val, unsigned delta, hypre_int width=HYPRE_WARP_SIZE)
{
return __shfl_down(val, delta, width);
}

template <typename T>
static __device__ __forceinline__
T __shfl_xor_sync(unsigned mask, T val, unsigned lanemask, hypre_int width=32)
T __shfl_xor_sync(unsigned mask, T val, unsigned lanemask, hypre_int width=HYPRE_WARP_SIZE)
{
return __shfl_xor(val, lanemask, width);
}

template <typename T>
static __device__ __forceinline__
T __shfl_up_sync(unsigned mask, T val, unsigned delta, hypre_int width=32)
T __shfl_up_sync(unsigned mask, T val, unsigned delta, hypre_int width=HYPRE_WARP_SIZE)
{
return __shfl_up(val, delta, width);
}
Expand All @@ -419,7 +419,7 @@ static __device__ __forceinline__
T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum)
{
#pragma unroll
for (hypre_int d = 2; d <= 32; d <<= 1)
for (hypre_int d = 2; d <=HYPRE_WARP_SIZE; d <<= 1)
{
T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1);
if ( (lane_id & (d - 1)) == d - 1 )
Expand All @@ -428,15 +428,15 @@ T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum)
}
}

all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, 31);
all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE-1);

if (lane_id == 31)
if (lane_id == HYPRE_WARP_SIZE-1)
{
in = 0;
}

#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);

Expand All @@ -460,7 +460,7 @@ static __device__ __forceinline__
T warp_reduce_sum(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in += __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d);
}
Expand All @@ -472,7 +472,7 @@ static __device__ __forceinline__
T warp_allreduce_sum(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);
}
Expand All @@ -484,7 +484,7 @@ static __device__ __forceinline__
T warp_reduce_max(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d));
}
Expand All @@ -496,7 +496,7 @@ static __device__ __forceinline__
T warp_allreduce_max(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d));
}
Expand All @@ -508,7 +508,7 @@ static __device__ __forceinline__
T warp_reduce_min(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in = min(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d));
}
Expand All @@ -520,7 +520,7 @@ static __device__ __forceinline__
T warp_allreduce_min(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d));
}
Expand Down
2 changes: 1 addition & 1 deletion src/utilities/hypre_cuda_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ hypre_GetDefaultCUDAGridDimension( HYPRE_Int n,
{
HYPRE_Int num_warps_per_block = num_threads_per_block >> 5;

hypre_assert(num_warps_per_block * 32 == num_threads_per_block);
hypre_assert(num_warps_per_block * HYPRE_WARP_SIZE == num_threads_per_block);

num_blocks = (n + num_warps_per_block - 1) / num_warps_per_block;
}
Expand Down
28 changes: 14 additions & 14 deletions src/utilities/hypre_cuda_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -367,28 +367,28 @@ hypre_double atomicAdd(hypre_double* address, hypre_double val)

template <typename T>
static __device__ __forceinline__
T __shfl_sync(unsigned mask, T val, hypre_int src_line, hypre_int width=32)
T __shfl_sync(unsigned mask, T val, hypre_int src_line, hypre_int width=HYPRE_WARP_SIZE)
{
return __shfl(val, src_line, width);
}

template <typename T>
static __device__ __forceinline__
T __shfl_down_sync(unsigned mask, T val, unsigned delta, hypre_int width=32)
T __shfl_down_sync(unsigned mask, T val, unsigned delta, hypre_int width=HYPRE_WARP_SIZE)
{
return __shfl_down(val, delta, width);
}

template <typename T>
static __device__ __forceinline__
T __shfl_xor_sync(unsigned mask, T val, unsigned lanemask, hypre_int width=32)
T __shfl_xor_sync(unsigned mask, T val, unsigned lanemask, hypre_int width=HYPRE_WARP_SIZE)
{
return __shfl_xor(val, lanemask, width);
}

template <typename T>
static __device__ __forceinline__
T __shfl_up_sync(unsigned mask, T val, unsigned delta, hypre_int width=32)
T __shfl_up_sync(unsigned mask, T val, unsigned delta, hypre_int width=HYPRE_WARP_SIZE)
{
return __shfl_up(val, delta, width);
}
Expand All @@ -413,7 +413,7 @@ static __device__ __forceinline__
T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum)
{
#pragma unroll
for (hypre_int d = 2; d <= 32; d <<= 1)
for (hypre_int d = 2; d <=HYPRE_WARP_SIZE; d <<= 1)
{
T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1);
if ( (lane_id & (d - 1)) == d - 1 )
Expand All @@ -422,15 +422,15 @@ T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum)
}
}

all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, 31);
all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE-1);

if (lane_id == 31)
if (lane_id == HYPRE_WARP_SIZE-1)
{
in = 0;
}

#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);

Expand All @@ -454,7 +454,7 @@ static __device__ __forceinline__
T warp_reduce_sum(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in += __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d);
}
Expand All @@ -466,7 +466,7 @@ static __device__ __forceinline__
T warp_allreduce_sum(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);
}
Expand All @@ -478,7 +478,7 @@ static __device__ __forceinline__
T warp_reduce_max(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d));
}
Expand All @@ -490,7 +490,7 @@ static __device__ __forceinline__
T warp_allreduce_max(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d));
}
Expand All @@ -502,7 +502,7 @@ static __device__ __forceinline__
T warp_reduce_min(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in = min(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d));
}
Expand All @@ -514,7 +514,7 @@ static __device__ __forceinline__
T warp_allreduce_min(T in)
{
#pragma unroll
for (hypre_int d = 16; d > 0; d >>= 1)
for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
{
in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d));
}
Expand Down

0 comments on commit 950a747

Please sign in to comment.