From cfb3ae4e32cd3d716d78c6b35e2a893ca0b03868 Mon Sep 17 00:00:00 2001 From: "Victor A. P. Magri" <50467563+victorapm@users.noreply.github.com> Date: Wed, 3 May 2023 06:44:45 -0400 Subject: [PATCH] Update `HYPRE_WARP_FULL_MASK` for HIP builds (#895) * Update HYPRE_WARP_FULL_MASK to 64-bit length for HIP * Add hypre_mask type depending on the GPU architecture * Change unsigned -> hypre_uint. Move a few hypre_int to hypre_uint --- src/utilities/_hypre_utilities.hpp | 69 ++++++++++++++++-------------- src/utilities/device_utils.h | 69 ++++++++++++++++-------------- 2 files changed, 76 insertions(+), 62 deletions(-) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 2117f0aa0..4c43fc87e 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -62,6 +62,15 @@ struct hypre_device_allocator #if defined(HYPRE_USING_GPU) +/* Data types depending on GPU architecture */ +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL) +typedef hypre_uint hypre_mask; + +#elif defined(HYPRE_USING_HIP) +typedef hypre_ulonglongint hypre_mask; + +#endif + /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * cuda includes * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ @@ -213,18 +222,16 @@ using hypre_DeviceItem = sycl::nd_item<3>; #define HYPRE_MAX_NTHREADS_BLOCK 1024 // HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE -#if defined(HYPRE_USING_CUDA) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL) #define HYPRE_WARP_SIZE 32 #define HYPRE_WARP_BITSHIFT 5 +#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF #elif defined(HYPRE_USING_HIP) #define HYPRE_WARP_SIZE 64 #define HYPRE_WARP_BITSHIFT 6 -#elif defined(HYPRE_USING_SYCL) -#define HYPRE_WARP_SIZE 32 -#define HYPRE_WARP_BITSHIFT 5 +#define HYPRE_WARP_FULL_MASK 0xFFFFFFFFFFFFFFF #endif -#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF #define HYPRE_MAX_NUM_WARPS (64 * 64 * 32) #define HYPRE_FLT_LARGE 1e30 #define HYPRE_1D_BLOCK_SIZE 512 @@ -981,32 +988,32 @@ hypre_double atomicAdd(hypre_double* address, hypre_double val) template static __device__ __forceinline__ -T __shfl_sync(unsigned mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE) +T __shfl_sync(hypre_mask mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE) { return __shfl(val, src_line, width); } template static __device__ __forceinline__ -T __shfl_down_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE) +T __shfl_up_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE) +{ + return __shfl_up(val, delta, width); +} + +template +static __device__ __forceinline__ +T __shfl_down_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_down(val, delta, width); } template static __device__ __forceinline__ -T __shfl_xor_sync(unsigned mask, T val, unsigned lanemask, hypre_int width = HYPRE_WARP_SIZE) +T __shfl_xor_sync(hypre_mask mask, T val, hypre_int lanemask, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_xor(val, lanemask, width); } -template -static __device__ __forceinline__ -T __shfl_up_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE) -{ - return __shfl_up(val, delta, width); -} - static __device__ __forceinline__ void __syncwarp() { @@ -1019,13 +1026,13 @@ void __syncwarp() // with these overloads for CUDA, just for HIP. #if defined(HYPRE_USING_HIP) static __device__ __forceinline__ -hypre_int __any_sync(unsigned mask, hypre_int predicate) +hypre_int __any_sync(hypre_mask mask, hypre_int predicate) { return __any(predicate); } static __device__ __forceinline__ -hypre_int __ballot_sync(unsigned mask, hypre_int predicate) +hypre_int __ballot_sync(hypre_mask mask, hypre_int predicate) { return __ballot(predicate); } @@ -1088,14 +1095,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum) } static __device__ __forceinline__ -hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate) +hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate) { return __any_sync(mask, predicate); } template static __device__ __forceinline__ -T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line, +T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_sync(mask, val, src_line, width); @@ -1103,7 +1110,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_ template static __device__ __forceinline__ -T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta, +T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_up_sync(mask, val, delta, width); @@ -1111,7 +1118,7 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d template static __device__ __forceinline__ -T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta, +T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_down_sync(mask, val, delta, width); @@ -1119,7 +1126,7 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int template static __device__ __forceinline__ -T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask, +T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_xor_sync(mask, val, lane_mask, width); @@ -1512,14 +1519,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum) } static __device__ __forceinline__ -hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate) +hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate) { return sycl::any_of_group(item.get_sub_group(), predicate); } template static __device__ __forceinline__ -T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line) +T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line) { /* WM: todo - I'm still getting bad results if I try to remove this barrier. Needs investigation. */ item.get_sub_group().barrier(); @@ -1528,7 +1535,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_ template static __device__ __forceinline__ -T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line, +T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line, hypre_int width) { hypre_int lane_id = hypre_gpu_get_lane_id<1>(item); @@ -1539,14 +1546,14 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_ template static __device__ __forceinline__ -T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta) +T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta) { return sycl::shift_group_right(item.get_sub_group(), val, delta); } template static __device__ __forceinline__ -T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta, +T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta, hypre_int width) { hypre_int lane_id = hypre_gpu_get_lane_id<1>(item); @@ -1557,14 +1564,14 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d template static __device__ __forceinline__ -T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta) +T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta) { return sycl::shift_group_left(item.get_sub_group(), val, delta); } template static __device__ __forceinline__ -T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta, +T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta, hypre_int width) { hypre_int lane_id = hypre_gpu_get_lane_id<1>(item); @@ -1575,14 +1582,14 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int template static __device__ __forceinline__ -T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask) +T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask) { return sycl::permute_group_by_xor(item.get_sub_group(), val, lane_mask); } template static __device__ __forceinline__ -T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask, +T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask, hypre_int width) { hypre_int lane_id = hypre_gpu_get_lane_id<1>(item); diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 5133b6d2b..b7a398fac 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -10,6 +10,15 @@ #if defined(HYPRE_USING_GPU) +/* Data types depending on GPU architecture */ +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL) +typedef hypre_uint hypre_mask; + +#elif defined(HYPRE_USING_HIP) +typedef hypre_ulonglongint hypre_mask; + +#endif + /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * cuda includes * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ @@ -161,18 +170,16 @@ using hypre_DeviceItem = sycl::nd_item<3>; #define HYPRE_MAX_NTHREADS_BLOCK 1024 // HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE -#if defined(HYPRE_USING_CUDA) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL) #define HYPRE_WARP_SIZE 32 #define HYPRE_WARP_BITSHIFT 5 +#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF #elif defined(HYPRE_USING_HIP) #define HYPRE_WARP_SIZE 64 #define HYPRE_WARP_BITSHIFT 6 -#elif defined(HYPRE_USING_SYCL) -#define HYPRE_WARP_SIZE 32 -#define HYPRE_WARP_BITSHIFT 5 +#define HYPRE_WARP_FULL_MASK 0xFFFFFFFFFFFFFFF #endif -#define HYPRE_WARP_FULL_MASK 0xFFFFFFFF #define HYPRE_MAX_NUM_WARPS (64 * 64 * 32) #define HYPRE_FLT_LARGE 1e30 #define HYPRE_1D_BLOCK_SIZE 512 @@ -929,32 +936,32 @@ hypre_double atomicAdd(hypre_double* address, hypre_double val) template static __device__ __forceinline__ -T __shfl_sync(unsigned mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE) +T __shfl_sync(hypre_mask mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE) { return __shfl(val, src_line, width); } template static __device__ __forceinline__ -T __shfl_down_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE) +T __shfl_up_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE) +{ + return __shfl_up(val, delta, width); +} + +template +static __device__ __forceinline__ +T __shfl_down_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_down(val, delta, width); } template static __device__ __forceinline__ -T __shfl_xor_sync(unsigned mask, T val, unsigned lanemask, hypre_int width = HYPRE_WARP_SIZE) +T __shfl_xor_sync(hypre_mask mask, T val, hypre_int lanemask, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_xor(val, lanemask, width); } -template -static __device__ __forceinline__ -T __shfl_up_sync(unsigned mask, T val, unsigned delta, hypre_int width = HYPRE_WARP_SIZE) -{ - return __shfl_up(val, delta, width); -} - static __device__ __forceinline__ void __syncwarp() { @@ -967,13 +974,13 @@ void __syncwarp() // with these overloads for CUDA, just for HIP. #if defined(HYPRE_USING_HIP) static __device__ __forceinline__ -hypre_int __any_sync(unsigned mask, hypre_int predicate) +hypre_int __any_sync(hypre_mask mask, hypre_int predicate) { return __any(predicate); } static __device__ __forceinline__ -hypre_int __ballot_sync(unsigned mask, hypre_int predicate) +hypre_int __ballot_sync(hypre_mask mask, hypre_int predicate) { return __ballot(predicate); } @@ -1036,14 +1043,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum) } static __device__ __forceinline__ -hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate) +hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate) { return __any_sync(mask, predicate); } template static __device__ __forceinline__ -T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line, +T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_sync(mask, val, src_line, width); @@ -1051,7 +1058,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_ template static __device__ __forceinline__ -T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta, +T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_up_sync(mask, val, delta, width); @@ -1059,7 +1066,7 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d template static __device__ __forceinline__ -T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta, +T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_down_sync(mask, val, delta, width); @@ -1067,7 +1074,7 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int template static __device__ __forceinline__ -T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask, +T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask, hypre_int width = HYPRE_WARP_SIZE) { return __shfl_xor_sync(mask, val, lane_mask, width); @@ -1460,14 +1467,14 @@ T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum) } static __device__ __forceinline__ -hypre_int warp_any_sync(hypre_DeviceItem &item, unsigned mask, hypre_int predicate) +hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate) { return sycl::any_of_group(item.get_sub_group(), predicate); } template static __device__ __forceinline__ -T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line) +T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line) { /* WM: todo - I'm still getting bad results if I try to remove this barrier. Needs investigation. */ item.get_sub_group().barrier(); @@ -1476,7 +1483,7 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_ template static __device__ __forceinline__ -T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_line, +T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line, hypre_int width) { hypre_int lane_id = hypre_gpu_get_lane_id<1>(item); @@ -1487,14 +1494,14 @@ T warp_shuffle_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int src_ template static __device__ __forceinline__ -T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta) +T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta) { return sycl::shift_group_right(item.get_sub_group(), val, delta); } template static __device__ __forceinline__ -T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta, +T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta, hypre_int width) { hypre_int lane_id = hypre_gpu_get_lane_id<1>(item); @@ -1505,14 +1512,14 @@ T warp_shuffle_up_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int d template static __device__ __forceinline__ -T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta) +T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta) { return sycl::shift_group_left(item.get_sub_group(), val, delta); } template static __device__ __forceinline__ -T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int delta, +T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta, hypre_int width) { hypre_int lane_id = hypre_gpu_get_lane_id<1>(item); @@ -1523,14 +1530,14 @@ T warp_shuffle_down_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int template static __device__ __forceinline__ -T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask) +T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask) { return sycl::permute_group_by_xor(item.get_sub_group(), val, lane_mask); } template static __device__ __forceinline__ -T warp_shuffle_xor_sync(hypre_DeviceItem &item, unsigned mask, T val, hypre_int lane_mask, +T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask, hypre_int width) { hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);