Fixes for Rocm 5.4.3 (#902)

* Use unroll_factor=8 for rocm-5.4.3
* Add SortCSRRocsparse back
* Fix Wunused-variable warnings
* Set _hypre_memory_tracker to NULL after destroy
* Update tioga results after changing default rocm version to 5.2.0
This commit is contained in:
Victor A. P. Magri 2023-05-11 09:05:26 -04:00 committed by GitHub
parent 412a6b1a48
commit 8b39b73a52
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 59 additions and 36 deletions

View File

@ -49,7 +49,7 @@ save="tioga"
## HIP ##
##########
module -q load rocm/5.1.1
module -q load rocm/5.2.0
# HIP without UM [benchmark, struct]
co="--with-hip --with-MPI-include=${MPICH_DIR}/include --with-MPI-lib-dirs=${MPICH_DIR}/lib --with-MPI-libs=mpi --with-gpu-arch='gfx90a' CC=cc CXX=CC"
@ -73,4 +73,3 @@ for errfile in $( find $output_dir ! -size 0 -name "*.err" )
do
echo $errfile >&2
done

View File

@ -118,6 +118,13 @@ hypre_ILUSetup( void *ilu_vdata,
#if defined (HYPRE_USING_GPU)
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1(hypre_ParCSRMatrixMemoryLocation(A));
/* VPM: Placeholder check to avoid -Wunused-variable warning. TODO: remove this */
if (exec != HYPRE_EXEC_DEVICE && exec != HYPRE_EXEC_HOST)
{
hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Need to run either on host or device!");
return hypre_error_flag;
}
#endif
/* ----- begin -----*/

View File

@ -118,6 +118,13 @@ hypre_ILUSolve( void *ilu_vdata,
#if defined(HYPRE_USING_GPU)
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_ParCSRMatrixMemoryLocation(A),
hypre_ParVectorMemoryLocation(f) );
/* VPM: Placeholder check to avoid -Wunused-variable warning. TODO: remove this */
if (exec != HYPRE_EXEC_DEVICE && exec != HYPRE_EXEC_HOST)
{
hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Need to run either on host or device!");
return hypre_error_flag;
}
#endif
/* begin */

View File

@ -26,8 +26,8 @@ static __device__ __forceinline__
HYPRE_Int
hypre_spgemm_hash_insert_numer(
#if defined(HYPRE_USING_SYCL)
HYPRE_Int *HashKeys,
HYPRE_Complex *HashVals,
HYPRE_Int *HashKeys,
HYPRE_Complex *HashVals,
#else
volatile HYPRE_Int *HashKeys,
volatile HYPRE_Complex *HashVals,
@ -38,7 +38,12 @@ hypre_spgemm_hash_insert_numer(
HYPRE_Int j = 0;
HYPRE_Int old = -1;
#if defined(HYPRE_USING_HIP) && (HIP_VERSION == 50422804)
/* VPM: see https://github.com/hypre-space/hypre/issues/875 */
#pragma unroll 8
#else
#pragma unroll UNROLL_FACTOR
#endif
for (HYPRE_Int i = 0; i < SHMEM_HASH_SIZE; i++)
{
/* compute the hash value of key */
@ -87,8 +92,8 @@ static __device__ __forceinline__
HYPRE_Int
hypre_spgemm_hash_insert_numer( HYPRE_Int HashSize,
#if defined(HYPRE_USING_SYCL)
HYPRE_Int *HashKeys,
HYPRE_Complex *HashVals,
HYPRE_Int *HashKeys,
HYPRE_Complex *HashVals,
#else
volatile HYPRE_Int *HashKeys,
volatile HYPRE_Complex *HashVals,
@ -160,8 +165,8 @@ hypre_spgemm_compute_row_numer( hypre_DeviceItem &item,
HYPRE_Int *jc,
HYPRE_Complex *ac,
#if defined(HYPRE_USING_SYCL)
HYPRE_Int *s_HashKeys,
HYPRE_Complex *s_HashVals,
HYPRE_Int *s_HashKeys,
HYPRE_Complex *s_HashVals,
#else
volatile HYPRE_Int *s_HashKeys,
volatile HYPRE_Complex *s_HashVals,
@ -259,8 +264,8 @@ hypre_spgemm_copy_from_hash_into_C_row( hypre_DeviceItem &item,
HYPRE_Int lane_id,
HYPRE_Int do_shared_copy,
#if defined(HYPRE_USING_SYCL)
HYPRE_Int *s_HashKeys,
HYPRE_Complex *s_HashVals,
HYPRE_Int *s_HashKeys,
HYPRE_Complex *s_HashVals,
#else
volatile HYPRE_Int *s_HashKeys,
volatile HYPRE_Complex *s_HashVals,
@ -691,13 +696,13 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m,
template <HYPRE_Int GROUP_SIZE>
__global__ void
hypre_spgemm_copy_from_Cext_into_C( hypre_DeviceItem &item,
HYPRE_Int M,
HYPRE_Int *ix,
HYPRE_Int *jx,
HYPRE_Complex *ax,
HYPRE_Int *ic,
HYPRE_Int *jc,
HYPRE_Complex *ac )
HYPRE_Int M,
HYPRE_Int *ix,
HYPRE_Int *jx,
HYPRE_Complex *ax,
HYPRE_Int *ic,
HYPRE_Int *jc,
HYPRE_Complex *ac )
{
/* number of groups in the grid */
#if defined(HYPRE_USING_SYCL)

View File

@ -54,7 +54,8 @@ hypreDevice_CSRSpGemmRocsparse(HYPRE_Int m,
/* For rocSPARSE, the CSR SpGEMM implementation does not require the columns to be sorted! */
/* RL: for matrices with long rows, it seemed that the sorting is still needed */
#if 0
/* VPM: Adding sorting back since it is necessary for correctness in a few cases */
#if 1
hypre_SortCSRRocsparse(m, k, nnzA, descrA, d_ia, d_ja_sorted, d_a_sorted);
hypre_SortCSRRocsparse(k, n, nnzB, descrB, d_ib, d_jb_sorted, d_b_sorted);
#endif

View File

@ -23,7 +23,7 @@ static __device__ __forceinline__
HYPRE_Int
hypre_spgemm_hash_insert_symbl(
#if defined(HYPRE_USING_SYCL)
HYPRE_Int *HashKeys,
HYPRE_Int *HashKeys,
#else
volatile HYPRE_Int *HashKeys,
#endif
@ -33,7 +33,12 @@ hypre_spgemm_hash_insert_symbl(
HYPRE_Int j = 0;
HYPRE_Int old = -1;
#if defined(HYPRE_USING_HIP) && (HIP_VERSION == 50422804)
/* VPM: see https://github.com/hypre-space/hypre/issues/875 */
#pragma unroll 8
#else
#pragma unroll UNROLL_FACTOR
#endif
for (HYPRE_Int i = 0; i < SHMEM_HASH_SIZE; i++)
{
/* compute the hash value of key */
@ -75,7 +80,7 @@ static __device__ __forceinline__
HYPRE_Int
hypre_spgemm_hash_insert_symbl( HYPRE_Int HashSize,
#if defined(HYPRE_USING_SYCL)
HYPRE_Int *HashKeys,
HYPRE_Int *HashKeys,
#else
volatile HYPRE_Int *HashKeys,
#endif
@ -133,7 +138,7 @@ hypre_spgemm_compute_row_symbl( hypre_DeviceItem &item,
const HYPRE_Int *ib,
const HYPRE_Int *jb,
#if defined(HYPRE_USING_SYCL)
HYPRE_Int *s_HashKeys,
HYPRE_Int *s_HashKeys,
#else
volatile HYPRE_Int *s_HashKeys,
#endif
@ -226,7 +231,7 @@ template <HYPRE_Int NUM_GROUPS_PER_BLOCK, HYPRE_Int GROUP_SIZE, HYPRE_Int SHMEM_
__global__ void
hypre_spgemm_symbolic( hypre_DeviceItem &item,
#if defined(HYPRE_USING_SYCL)
char *shmem_ptr,
char *shmem_ptr,
#endif
const HYPRE_Int M,
const HYPRE_Int* __restrict__ rind,

View File

@ -62,16 +62,16 @@ PCG Solve wall clock time = 0.177456 seconds
PCG Setup wall clock time = 0.240707 seconds
PCG Solve wall clock time = 0.189251 seconds
# Output file: benchmark_ij.out.22
PCG Setup wall clock time = 0.239942 seconds
PCG Setup wall clock time = 0.291317 seconds
PCG Solve wall clock time = 0.168473 seconds
# Output file: benchmark_ij.out.23
PCG Setup wall clock time = 0.235985 seconds
PCG Setup wall clock time = 0.288176 seconds
PCG Solve wall clock time = 0.168673 seconds
# Output file: benchmark_ij.out.24
PCG Setup wall clock time = 0.217153 seconds
PCG Setup wall clock time = 0.258924 seconds
PCG Solve wall clock time = 0.185827 seconds
# Output file: benchmark_ij.out.25
PCG Setup wall clock time = 0.205445 seconds
PCG Setup wall clock time = 0.257346 seconds
PCG Solve wall clock time = 0.198855 seconds
# Output file: benchmark_ij.out.26
PCG Setup wall clock time = 0.265667 seconds

View File

@ -84,19 +84,19 @@ Final Relative Residual Norm = 4.476621e-09
# Output file: benchmark_ij.out.22
Iterations = 13
Final Relative Residual Norm = 1.703911e-09
Final Relative Residual Norm = 1.810780e-09
# Output file: benchmark_ij.out.23
Iterations = 13
Final Relative Residual Norm = 5.943877e-09
Final Relative Residual Norm = 6.103525e-09
# Output file: benchmark_ij.out.24
Iterations = 14
Final Relative Residual Norm = 3.250031e-09
Final Relative Residual Norm = 3.116077e-09
# Output file: benchmark_ij.out.25
Iterations = 15
Final Relative Residual Norm = 5.483580e-09
Final Relative Residual Norm = 5.382858e-09
# Output file: benchmark_ij.out.26
Iterations = 37
@ -105,4 +105,3 @@ Final Relative Residual Norm = 6.305999e-09
# Output file: benchmark_ij.out.27
Iterations = 22
Final Relative Residual Norm = 8.787135e-09

View File

@ -23,13 +23,13 @@ Device Parcsr Matrix-by-Matrix wall clock time = 0.016367 seconds
# Output file: benchmark_spgemm.out.12
Device Parcsr Matrix-by-Matrix wall clock time = 0.019611 seconds
# Output file: benchmark_spgemm.out.13
Device Parcsr Matrix-by-Matrix wall clock time = 0.008411 seconds
Device Parcsr Matrix-by-Matrix wall clock time = 0.018017 seconds
# Output file: benchmark_spgemm.out.14
Device Parcsr Matrix-by-Matrix wall clock time = 0.130224 seconds
# Output file: benchmark_spgemm.out.15
Device Parcsr Matrix-by-Matrix wall clock time = 0.001776 seconds
Device Parcsr Matrix-by-Matrix wall clock time = 0.006545 seconds
# Output file: benchmark_spgemm.out.16
Device Parcsr Matrix-by-Matrix wall clock time = 0.007971 seconds
Device Parcsr Matrix-by-Matrix wall clock time = 0.013026 seconds
# Output file: benchmark_spgemm.out.17
Device Parcsr Matrix-by-Matrix wall clock time = 0.002087 seconds
# Output file: benchmark_spgemm.out.18

View File

@ -14,7 +14,7 @@ PCG Solve wall clock time = 7.285773 seconds
PCG Setup wall clock time = 0.012046 seconds
PCG Solve wall clock time = 0.095558 seconds
# Output file: benchmark_struct.out.6
PCG Setup wall clock time = 0.019933 seconds
PCG Setup wall clock time = 0.016907 seconds
PCG Solve wall clock time = 0.241860 seconds
# Output file: benchmark_struct.out.7
PCG Setup wall clock time = 0.053942 seconds

View File

@ -371,7 +371,6 @@ HYPRE_Finalize(void)
#endif
hypre_HandleDestroy(_hypre_handle);
_hypre_handle = NULL;
#if !defined(HYPRE_USING_SYCL)
@ -383,6 +382,7 @@ HYPRE_Finalize(void)
hypre_memory_tracker_print, hypre_memory_tracker_filename);
hypre_MemoryTrackerDestroy(_hypre_memory_tracker);
_hypre_memory_tracker = NULL;
#endif
return hypre_error_flag;