Update HYPRE_WARP_FULL_MASK for HIP builds (#895)

* Update HYPRE_WARP_FULL_MASK to 64-bit length for HIP
* Add hypre_mask type depending on the GPU architecture
* Change unsigned -> hypre_uint. Move a few hypre_int to hypre_uint
This commit is contained in:
Victor A. P. Magri 2023-05-03 06:44:45 -04:00 committed by GitHub
parent e351324df8
commit cfb3ae4e32
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 76 additions and 62 deletions

View File

@ -62,6 +62,15 @@ struct hypre_device_allocator
#if defined(HYPRE_USING_GPU)
/* Data types depending on GPU architecture */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
typedef hypre_uint hypre_mask;
#elif defined(HYPRE_USING_HIP)
typedef hypre_ulonglongint hypre_mask;
#endif
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
* cuda includes
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
@ -213,18 +222,16 @@ using hypre_DeviceItem = sycl::nd_item<3>;
#define HYPRE_MAX_NTHREADS_BLOCK 1024
// HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE
#if defined(HYPRE_USING_CUDA)
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
#define HYPRE_WARP_SIZE 32
#define HYPRE_WARP_BITSHIFT 5
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF
#elif defined(HYPRE_USING_HIP)
#define HYPRE_WARP_SIZE 64
#define HYPRE_WARP_BITSHIFT 6
#elif defined(HYPRE_USING_SYCL)
#define HYPRE_WARP_SIZE 32
#define HYPRE_WARP_BITSHIFT 5
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFFFFFFFFF
#endif
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF
#define HYPRE_MAX_NUM_WARPS (64 * 64 * 32)
#define HYPRE_FLT_LARGE 1e30
#define HYPRE_1D_BLOCK_SIZE 512
@ -981,32 +988,32 @@ hypre_double atomicAdd(hypre_double* address, hypre_double val)
template <typename T>
static __device__ __forceinline__
T __shfl_sync(unsigned mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE)
T __shfl_sync(hypre_mask mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl(val, src_line, width);
}
template <typename T>
static __device__ __forceinline__
T __shfl_down_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE)
T __shfl_up_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_up(val, delta, width);
}
template <typename T>
static __device__ __forceinline__
T __shfl_down_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_down(val, delta, width);
}
template <typename T>
static __device__ __forceinline__
T __shfl_xor_sync(unsigned mask, T val, unsigned lanemask, hypre_int width = HYPRE_WARP_SIZE)
T __shfl_xor_sync(hypre_mask mask, T val, hypre_int lanemask, hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_xor(val, lanemask, width);
}
template <typename T>
static __device__ __forceinline__
T __shfl_up_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_up(val, delta, width);
}
static __device__ __forceinline__
void __syncwarp()
{
@ -1019,13 +1026,13 @@ void __syncwarp()
// with these overloads for CUDA, just for HIP.
#if defined(HYPRE_USING_HIP)
static __device__ __forceinline__
hypre_int __any_sync(unsigned mask, hypre_int predicate)
hypre_int __any_sync(hypre_mask mask, hypre_int predicate)
{
return __any(predicate);
}
static __device__ __forceinline__
hypre_int __ballot_sync(unsigned mask, hypre_int predicate)
hypre_int __ballot_sync(hypre_mask mask, hypre_int predicate)
{
return __ballot(predicate);
}
@ -1088,14 +1095,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
}
static __device__ __forceinline__
hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate)
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
return __any_sync(mask, predicate);
}
template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line,
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_sync(mask, val, src_line, width);
@ -1103,7 +1110,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_up_sync(mask, val, delta, width);
@ -1111,7 +1118,7 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d
template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_down_sync(mask, val, delta, width);
@ -1119,7 +1126,7 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int
template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask,
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_xor_sync(mask, val, lane_mask, width);
@ -1512,14 +1519,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
}
static __device__ __forceinline__
hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate)
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
return sycl::any_of_group(item.get_sub_group(), predicate);
}
template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line)
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line)
{
/* WM: todo - I'm still getting bad results if I try to remove this barrier. Needs investigation. */
item.get_sub_group().barrier();
@ -1528,7 +1535,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line,
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
hypre_int width)
{
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
@ -1539,14 +1546,14 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta)
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
{
return sycl::shift_group_right(item.get_sub_group(), val, delta);
}
template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
hypre_int width)
{
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
@ -1557,14 +1564,14 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d
template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta)
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
{
return sycl::shift_group_left(item.get_sub_group(), val, delta);
}
template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
hypre_int width)
{
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
@ -1575,14 +1582,14 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int
template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask)
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask)
{
return sycl::permute_group_by_xor(item.get_sub_group(), val, lane_mask);
}
template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask,
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
hypre_int width)
{
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);

View File

@ -10,6 +10,15 @@
#if defined(HYPRE_USING_GPU)
/* Data types depending on GPU architecture */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
typedef hypre_uint hypre_mask;
#elif defined(HYPRE_USING_HIP)
typedef hypre_ulonglongint hypre_mask;
#endif
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
* cuda includes
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
@ -161,18 +170,16 @@ using hypre_DeviceItem = sycl::nd_item<3>;
#define HYPRE_MAX_NTHREADS_BLOCK 1024
// HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE
#if defined(HYPRE_USING_CUDA)
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
#define HYPRE_WARP_SIZE 32
#define HYPRE_WARP_BITSHIFT 5
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF
#elif defined(HYPRE_USING_HIP)
#define HYPRE_WARP_SIZE 64
#define HYPRE_WARP_BITSHIFT 6
#elif defined(HYPRE_USING_SYCL)
#define HYPRE_WARP_SIZE 32
#define HYPRE_WARP_BITSHIFT 5
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFFFFFFFFF
#endif
#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF
#define HYPRE_MAX_NUM_WARPS (64 * 64 * 32)
#define HYPRE_FLT_LARGE 1e30
#define HYPRE_1D_BLOCK_SIZE 512
@ -929,32 +936,32 @@ hypre_double atomicAdd(hypre_double* address, hypre_double val)
template <typename T>
static __device__ __forceinline__
T __shfl_sync(unsigned mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE)
T __shfl_sync(hypre_mask mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl(val, src_line, width);
}
template <typename T>
static __device__ __forceinline__
T __shfl_down_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE)
T __shfl_up_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_up(val, delta, width);
}
template <typename T>
static __device__ __forceinline__
T __shfl_down_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_down(val, delta, width);
}
template <typename T>
static __device__ __forceinline__
T __shfl_xor_sync(unsigned mask, T val, unsigned lanemask, hypre_int width = HYPRE_WARP_SIZE)
T __shfl_xor_sync(hypre_mask mask, T val, hypre_int lanemask, hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_xor(val, lanemask, width);
}
template <typename T>
static __device__ __forceinline__
T __shfl_up_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_up(val, delta, width);
}
static __device__ __forceinline__
void __syncwarp()
{
@ -967,13 +974,13 @@ void __syncwarp()
// with these overloads for CUDA, just for HIP.
#if defined(HYPRE_USING_HIP)
static __device__ __forceinline__
hypre_int __any_sync(unsigned mask, hypre_int predicate)
hypre_int __any_sync(hypre_mask mask, hypre_int predicate)
{
return __any(predicate);
}
static __device__ __forceinline__
hypre_int __ballot_sync(unsigned mask, hypre_int predicate)
hypre_int __ballot_sync(hypre_mask mask, hypre_int predicate)
{
return __ballot(predicate);
}
@ -1036,14 +1043,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
}
static __device__ __forceinline__
hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate)
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
return __any_sync(mask, predicate);
}
template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line,
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_sync(mask, val, src_line, width);
@ -1051,7 +1058,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_up_sync(mask, val, delta, width);
@ -1059,7 +1066,7 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d
template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_down_sync(mask, val, delta, width);
@ -1067,7 +1074,7 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int
template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask,
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
hypre_int width = HYPRE_WARP_SIZE)
{
return __shfl_xor_sync(mask, val, lane_mask, width);
@ -1460,14 +1467,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
}
static __device__ __forceinline__
hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate)
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
return sycl::any_of_group(item.get_sub_group(), predicate);
}
template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line)
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line)
{
/* WM: todo - I'm still getting bad results if I try to remove this barrier. Needs investigation. */
item.get_sub_group().barrier();
@ -1476,7 +1483,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line,
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
hypre_int width)
{
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
@ -1487,14 +1494,14 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_
template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta)
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
{
return sycl::shift_group_right(item.get_sub_group(), val, delta);
}
template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
hypre_int width)
{
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
@ -1505,14 +1512,14 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d
template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta)
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
{
return sycl::shift_group_left(item.get_sub_group(), val, delta);
}
template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta,
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
hypre_int width)
{
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
@ -1523,14 +1530,14 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int
template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask)
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask)
{
return sycl::permute_group_by_xor(item.get_sub_group(), val, lane_mask);
}
template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask,
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
hypre_int width)
{
hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);