Update HYPRE_WARP_FULL_MASK for HIP builds (#895)
* Update HYPRE_WARP_FULL_MASK to 64-bit length for HIP * Add hypre_mask type depending on the GPU architecture * Change unsigned -> hypre_uint. Move a few hypre_int to hypre_uint
This commit is contained in:
parent
e351324df8
commit
cfb3ae4e32
@ -62,6 +62,15 @@ struct hypre_device_allocator
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
|
||||
/* Data types depending on GPU architecture */
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
|
||||
typedef hypre_uint hypre_mask;
|
||||
|
||||
#elif defined(HYPRE_USING_HIP)
|
||||
typedef hypre_ulonglongint hypre_mask;
|
||||
|
||||
#endif
|
||||
|
||||
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
* cuda includes
|
||||
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
|
||||
@ -213,18 +222,16 @@ using hypre_DeviceItem = sycl::nd_item<3>;
|
||||
#define HYPRE_MAX_NTHREADS_BLOCK 1024
|
||||
|
||||
// HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE
|
||||
#if defined(HYPRE_USING_CUDA)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
|
||||
#define HYPRE_WARP_SIZE 32
|
||||
#define HYPRE_WARP_BITSHIFT 5
|
||||
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF
|
||||
#elif defined(HYPRE_USING_HIP)
|
||||
#define HYPRE_WARP_SIZE 64
|
||||
#define HYPRE_WARP_BITSHIFT 6
|
||||
#elif defined(HYPRE_USING_SYCL)
|
||||
#define HYPRE_WARP_SIZE 32
|
||||
#define HYPRE_WARP_BITSHIFT 5
|
||||
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFFFFFFFFF
|
||||
#endif
|
||||
|
||||
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF
|
||||
#define HYPRE_MAX_NUM_WARPS (64 * 64 * 32)
|
||||
#define HYPRE_FLT_LARGE 1e30
|
||||
#define HYPRE_1D_BLOCK_SIZE 512
|
||||
@ -981,32 +988,32 @@ hypre_double atomicAdd(hypre_double* address, hypre_double val)
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T __shfl_sync(unsigned mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE)
|
||||
T __shfl_sync(hypre_mask mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl(val, src_line, width);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T __shfl_down_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE)
|
||||
T __shfl_up_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_up(val, delta, width);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T __shfl_down_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_down(val, delta, width);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T __shfl_xor_sync(unsigned mask, T val, unsigned lanemask, hypre_int width = HYPRE_WARP_SIZE)
|
||||
T __shfl_xor_sync(hypre_mask mask, T val, hypre_int lanemask, hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_xor(val, lanemask, width);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T __shfl_up_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_up(val, delta, width);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
void __syncwarp()
|
||||
{
|
||||
@ -1019,13 +1026,13 @@ void __syncwarp()
|
||||
// with these overloads for CUDA, just for HIP.
|
||||
#if defined(HYPRE_USING_HIP)
|
||||
static __device__ __forceinline__
|
||||
hypre_int __any_sync(unsigned mask, hypre_int predicate)
|
||||
hypre_int __any_sync(hypre_mask mask, hypre_int predicate)
|
||||
{
|
||||
return __any(predicate);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
hypre_int __ballot_sync(unsigned mask, hypre_int predicate)
|
||||
hypre_int __ballot_sync(hypre_mask mask, hypre_int predicate)
|
||||
{
|
||||
return __ballot(predicate);
|
||||
}
|
||||
@ -1088,14 +1095,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate)
|
||||
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
|
||||
{
|
||||
return __any_sync(mask, predicate);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line,
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
|
||||
hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_sync(mask, val, src_line, width);
|
||||
@ -1103,7 +1110,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
|
||||
hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_up_sync(mask, val, delta, width);
|
||||
@ -1111,7 +1118,7 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
|
||||
hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_down_sync(mask, val, delta, width);
|
||||
@ -1119,7 +1126,7 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask,
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
|
||||
hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_xor_sync(mask, val, lane_mask, width);
|
||||
@ -1512,14 +1519,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate)
|
||||
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
|
||||
{
|
||||
return sycl::any_of_group(item.get_sub_group(), predicate);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line)
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line)
|
||||
{
|
||||
/* WM: todo - I'm still getting bad results if I try to remove this barrier. Needs investigation. */
|
||||
item.get_sub_group().barrier();
|
||||
@ -1528,7 +1535,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line,
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
|
||||
hypre_int width)
|
||||
{
|
||||
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
|
||||
@ -1539,14 +1546,14 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta)
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
|
||||
{
|
||||
return sycl::shift_group_right(item.get_sub_group(), val, delta);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
|
||||
hypre_int width)
|
||||
{
|
||||
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
|
||||
@ -1557,14 +1564,14 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta)
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
|
||||
{
|
||||
return sycl::shift_group_left(item.get_sub_group(), val, delta);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
|
||||
hypre_int width)
|
||||
{
|
||||
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
|
||||
@ -1575,14 +1582,14 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask)
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask)
|
||||
{
|
||||
return sycl::permute_group_by_xor(item.get_sub_group(), val, lane_mask);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask,
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
|
||||
hypre_int width)
|
||||
{
|
||||
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
|
||||
|
||||
@ -10,6 +10,15 @@
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
|
||||
/* Data types depending on GPU architecture */
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
|
||||
typedef hypre_uint hypre_mask;
|
||||
|
||||
#elif defined(HYPRE_USING_HIP)
|
||||
typedef hypre_ulonglongint hypre_mask;
|
||||
|
||||
#endif
|
||||
|
||||
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
* cuda includes
|
||||
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
|
||||
@ -161,18 +170,16 @@ using hypre_DeviceItem = sycl::nd_item<3>;
|
||||
#define HYPRE_MAX_NTHREADS_BLOCK 1024
|
||||
|
||||
// HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE
|
||||
#if defined(HYPRE_USING_CUDA)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
|
||||
#define HYPRE_WARP_SIZE 32
|
||||
#define HYPRE_WARP_BITSHIFT 5
|
||||
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF
|
||||
#elif defined(HYPRE_USING_HIP)
|
||||
#define HYPRE_WARP_SIZE 64
|
||||
#define HYPRE_WARP_BITSHIFT 6
|
||||
#elif defined(HYPRE_USING_SYCL)
|
||||
#define HYPRE_WARP_SIZE 32
|
||||
#define HYPRE_WARP_BITSHIFT 5
|
||||
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFFFFFFFFF
|
||||
#endif
|
||||
|
||||
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF
|
||||
#define HYPRE_MAX_NUM_WARPS (64 * 64 * 32)
|
||||
#define HYPRE_FLT_LARGE 1e30
|
||||
#define HYPRE_1D_BLOCK_SIZE 512
|
||||
@ -929,32 +936,32 @@ hypre_double atomicAdd(hypre_double* address, hypre_double val)
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T __shfl_sync(unsigned mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE)
|
||||
T __shfl_sync(hypre_mask mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl(val, src_line, width);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T __shfl_down_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE)
|
||||
T __shfl_up_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_up(val, delta, width);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T __shfl_down_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_down(val, delta, width);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T __shfl_xor_sync(unsigned mask, T val, unsigned lanemask, hypre_int width = HYPRE_WARP_SIZE)
|
||||
T __shfl_xor_sync(hypre_mask mask, T val, hypre_int lanemask, hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_xor(val, lanemask, width);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T __shfl_up_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_up(val, delta, width);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
void __syncwarp()
|
||||
{
|
||||
@ -967,13 +974,13 @@ void __syncwarp()
|
||||
// with these overloads for CUDA, just for HIP.
|
||||
#if defined(HYPRE_USING_HIP)
|
||||
static __device__ __forceinline__
|
||||
hypre_int __any_sync(unsigned mask, hypre_int predicate)
|
||||
hypre_int __any_sync(hypre_mask mask, hypre_int predicate)
|
||||
{
|
||||
return __any(predicate);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
hypre_int __ballot_sync(unsigned mask, hypre_int predicate)
|
||||
hypre_int __ballot_sync(hypre_mask mask, hypre_int predicate)
|
||||
{
|
||||
return __ballot(predicate);
|
||||
}
|
||||
@ -1036,14 +1043,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate)
|
||||
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
|
||||
{
|
||||
return __any_sync(mask, predicate);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line,
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
|
||||
hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_sync(mask, val, src_line, width);
|
||||
@ -1051,7 +1058,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
|
||||
hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_up_sync(mask, val, delta, width);
|
||||
@ -1059,7 +1066,7 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
|
||||
hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_down_sync(mask, val, delta, width);
|
||||
@ -1067,7 +1074,7 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask,
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
|
||||
hypre_int width = HYPRE_WARP_SIZE)
|
||||
{
|
||||
return __shfl_xor_sync(mask, val, lane_mask, width);
|
||||
@ -1460,14 +1467,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate)
|
||||
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
|
||||
{
|
||||
return sycl::any_of_group(item.get_sub_group(), predicate);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line)
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line)
|
||||
{
|
||||
/* WM: todo - I'm still getting bad results if I try to remove this barrier. Needs investigation. */
|
||||
item.get_sub_group().barrier();
|
||||
@ -1476,7 +1483,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line,
|
||||
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
|
||||
hypre_int width)
|
||||
{
|
||||
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
|
||||
@ -1487,14 +1494,14 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta)
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
|
||||
{
|
||||
return sycl::shift_group_right(item.get_sub_group(), val, delta);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
|
||||
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
|
||||
hypre_int width)
|
||||
{
|
||||
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
|
||||
@ -1505,14 +1512,14 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta)
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
|
||||
{
|
||||
return sycl::shift_group_left(item.get_sub_group(), val, delta);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
|
||||
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
|
||||
hypre_int width)
|
||||
{
|
||||
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
|
||||
@ -1523,14 +1530,14 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask)
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask)
|
||||
{
|
||||
return sycl::permute_group_by_xor(item.get_sub_group(), val, lane_mask);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask,
|
||||
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
|
||||
hypre_int width)
|
||||
{
|
||||
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user