Fixes for Rocm 5.4.3 (#902)

* Use unroll_factor=8 for rocm-5.4.3
* Add SortCSRRocsparse back
* Fix Wunused-variable warnings
* Set _hypre_memory_tracker to NULL after destroy
* Update tioga results after changing default rocm version to 5.2.0
This commit is contained in:
Victor A. P. Magri 2023-05-11 09:05:26 -04:00 committed by GitHub
parent 412a6b1a48
commit 8b39b73a52
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 59 additions and 36 deletions

View File

@ -49,7 +49,7 @@ save="tioga"
## HIP ##
##########
module -q load rocm/5.1.1
module -q load rocm/5.2.0
# HIP without UM [benchmark, struct]
co="--with-hip --with-MPI-include=${MPICH_DIR}/include --with-MPI-lib-dirs=${MPICH_DIR}/lib --with-MPI-libs=mpi --with-gpu-arch='gfx90a' CC=cc CXX=CC"
@ -73,4 +73,3 @@ for errfile in $( find $output_dir ! -size 0 -name "*.err" )
do
echo $errfile >&2
done

View File

@ -118,6 +118,13 @@ hypre_ILUSetup( void *ilu_vdata,
#if defined (HYPRE_USING_GPU)
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1(hypre_ParCSRMatrixMemoryLocation(A));
/* VPM: Placeholder check to avoid -Wunused-variable warning. TODO: remove this */
if (exec != HYPRE_EXEC_DEVICE && exec != HYPRE_EXEC_HOST)
{
hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Need to run either on host or device!");
return hypre_error_flag;
}
#endif
/* ----- begin -----*/

View File

@ -118,6 +118,13 @@ hypre_ILUSolve( void *ilu_vdata,
#if defined(HYPRE_USING_GPU)
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_ParCSRMatrixMemoryLocation(A),
hypre_ParVectorMemoryLocation(f) );
/* VPM: Placeholder check to avoid -Wunused-variable warning. TODO: remove this */
if (exec != HYPRE_EXEC_DEVICE && exec != HYPRE_EXEC_HOST)
{
hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Need to run either on host or device!");
return hypre_error_flag;
}
#endif
/* begin */

View File

@ -38,7 +38,12 @@ hypre_spgemm_hash_insert_numer(
HYPRE_Int j = 0;
HYPRE_Int old = -1;
#if defined(HYPRE_USING_HIP) && (HIP_VERSION == 50422804)
/* VPM: see https://github.com/hypre-space/hypre/issues/875 */
#pragma unroll 8
#else
#pragma unroll UNROLL_FACTOR
#endif
for (HYPRE_Int i = 0; i < SHMEM_HASH_SIZE; i++)
{
/* compute the hash value of key */

View File

@ -54,7 +54,8 @@ hypreDevice_CSRSpGemmRocsparse(HYPRE_Int m,
/* For rocSPARSE, the CSR SpGEMM implementation does not require the columns to be sorted! */
/* RL: for matrices with long rows, it seemed that the sorting is still needed */
#if 0
/* VPM: Adding sorting back since it is necessary for correctness in a few cases */
#if 1
hypre_SortCSRRocsparse(m, k, nnzA, descrA, d_ia, d_ja_sorted, d_a_sorted);
hypre_SortCSRRocsparse(k, n, nnzB, descrB, d_ib, d_jb_sorted, d_b_sorted);
#endif

View File

@ -33,7 +33,12 @@ hypre_spgemm_hash_insert_symbl(
HYPRE_Int j = 0;
HYPRE_Int old = -1;
#if defined(HYPRE_USING_HIP) && (HIP_VERSION == 50422804)
/* VPM: see https://github.com/hypre-space/hypre/issues/875 */
#pragma unroll 8
#else
#pragma unroll UNROLL_FACTOR
#endif
for (HYPRE_Int i = 0; i < SHMEM_HASH_SIZE; i++)
{
/* compute the hash value of key */

View File

@ -62,16 +62,16 @@ PCG Solve wall clock time = 0.177456 seconds
PCG Setup wall clock time = 0.240707 seconds
PCG Solve wall clock time = 0.189251 seconds
# Output file: benchmark_ij.out.22
PCG Setup wall clock time = 0.239942 seconds
PCG Setup wall clock time = 0.291317 seconds
PCG Solve wall clock time = 0.168473 seconds
# Output file: benchmark_ij.out.23
PCG Setup wall clock time = 0.235985 seconds
PCG Setup wall clock time = 0.288176 seconds
PCG Solve wall clock time = 0.168673 seconds
# Output file: benchmark_ij.out.24
PCG Setup wall clock time = 0.217153 seconds
PCG Setup wall clock time = 0.258924 seconds
PCG Solve wall clock time = 0.185827 seconds
# Output file: benchmark_ij.out.25
PCG Setup wall clock time = 0.205445 seconds
PCG Setup wall clock time = 0.257346 seconds
PCG Solve wall clock time = 0.198855 seconds
# Output file: benchmark_ij.out.26
PCG Setup wall clock time = 0.265667 seconds

View File

@ -84,19 +84,19 @@ Final Relative Residual Norm = 4.476621e-09
# Output file: benchmark_ij.out.22
Iterations = 13
Final Relative Residual Norm = 1.703911e-09
Final Relative Residual Norm = 1.810780e-09
# Output file: benchmark_ij.out.23
Iterations = 13
Final Relative Residual Norm = 5.943877e-09
Final Relative Residual Norm = 6.103525e-09
# Output file: benchmark_ij.out.24
Iterations = 14
Final Relative Residual Norm = 3.250031e-09
Final Relative Residual Norm = 3.116077e-09
# Output file: benchmark_ij.out.25
Iterations = 15
Final Relative Residual Norm = 5.483580e-09
Final Relative Residual Norm = 5.382858e-09
# Output file: benchmark_ij.out.26
Iterations = 37
@ -105,4 +105,3 @@ Final Relative Residual Norm = 6.305999e-09
# Output file: benchmark_ij.out.27
Iterations = 22
Final Relative Residual Norm = 8.787135e-09

View File

@ -23,13 +23,13 @@ Device Parcsr Matrix-by-Matrix wall clock time = 0.016367 seconds
# Output file: benchmark_spgemm.out.12
Device Parcsr Matrix-by-Matrix wall clock time = 0.019611 seconds
# Output file: benchmark_spgemm.out.13
Device Parcsr Matrix-by-Matrix wall clock time = 0.008411 seconds
Device Parcsr Matrix-by-Matrix wall clock time = 0.018017 seconds
# Output file: benchmark_spgemm.out.14
Device Parcsr Matrix-by-Matrix wall clock time = 0.130224 seconds
# Output file: benchmark_spgemm.out.15
Device Parcsr Matrix-by-Matrix wall clock time = 0.001776 seconds
Device Parcsr Matrix-by-Matrix wall clock time = 0.006545 seconds
# Output file: benchmark_spgemm.out.16
Device Parcsr Matrix-by-Matrix wall clock time = 0.007971 seconds
Device Parcsr Matrix-by-Matrix wall clock time = 0.013026 seconds
# Output file: benchmark_spgemm.out.17
Device Parcsr Matrix-by-Matrix wall clock time = 0.002087 seconds
# Output file: benchmark_spgemm.out.18

View File

@ -14,7 +14,7 @@ PCG Solve wall clock time = 7.285773 seconds
PCG Setup wall clock time = 0.012046 seconds
PCG Solve wall clock time = 0.095558 seconds
# Output file: benchmark_struct.out.6
PCG Setup wall clock time = 0.019933 seconds
PCG Setup wall clock time = 0.016907 seconds
PCG Solve wall clock time = 0.241860 seconds
# Output file: benchmark_struct.out.7
PCG Setup wall clock time = 0.053942 seconds

View File

@ -371,7 +371,6 @@ HYPRE_Finalize(void)
#endif
hypre_HandleDestroy(_hypre_handle);
_hypre_handle = NULL;
#if !defined(HYPRE_USING_SYCL)
@ -383,6 +382,7 @@ HYPRE_Finalize(void)
hypre_memory_tracker_print, hypre_memory_tracker_filename);
hypre_MemoryTrackerDestroy(_hypre_memory_tracker);
_hypre_memory_tracker = NULL;
#endif
return hypre_error_flag;