Fixes for Rocm 5.4.3 (#902)
* Use unroll_factor=8 for rocm-5.4.3 * Add SortCSRRocsparse back * Fix Wunused-variable warnings * Set _hypre_memory_tracker to NULL after destroy * Update tioga results after changing default rocm version to 5.2.0
This commit is contained in:
parent
412a6b1a48
commit
8b39b73a52
@ -49,7 +49,7 @@ save="tioga"
|
||||
## HIP ##
|
||||
##########
|
||||
|
||||
module -q load rocm/5.1.1
|
||||
module -q load rocm/5.2.0
|
||||
|
||||
# HIP without UM [benchmark, struct]
|
||||
co="--with-hip --with-MPI-include=${MPICH_DIR}/include --with-MPI-lib-dirs=${MPICH_DIR}/lib --with-MPI-libs=mpi --with-gpu-arch='gfx90a' CC=cc CXX=CC"
|
||||
@ -73,4 +73,3 @@ for errfile in $( find $output_dir ! -size 0 -name "*.err" )
|
||||
do
|
||||
echo $errfile >&2
|
||||
done
|
||||
|
||||
|
||||
@ -118,6 +118,13 @@ hypre_ILUSetup( void *ilu_vdata,
|
||||
|
||||
#if defined (HYPRE_USING_GPU)
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1(hypre_ParCSRMatrixMemoryLocation(A));
|
||||
|
||||
/* VPM: Placeholder check to avoid -Wunused-variable warning. TODO: remove this */
|
||||
if (exec != HYPRE_EXEC_DEVICE && exec != HYPRE_EXEC_HOST)
|
||||
{
|
||||
hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Need to run either on host or device!");
|
||||
return hypre_error_flag;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ----- begin -----*/
|
||||
|
||||
@ -118,6 +118,13 @@ hypre_ILUSolve( void *ilu_vdata,
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_ParCSRMatrixMemoryLocation(A),
|
||||
hypre_ParVectorMemoryLocation(f) );
|
||||
|
||||
/* VPM: Placeholder check to avoid -Wunused-variable warning. TODO: remove this */
|
||||
if (exec != HYPRE_EXEC_DEVICE && exec != HYPRE_EXEC_HOST)
|
||||
{
|
||||
hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Need to run either on host or device!");
|
||||
return hypre_error_flag;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* begin */
|
||||
|
||||
@ -26,8 +26,8 @@ static __device__ __forceinline__
|
||||
HYPRE_Int
|
||||
hypre_spgemm_hash_insert_numer(
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
HYPRE_Int *HashKeys,
|
||||
HYPRE_Complex *HashVals,
|
||||
HYPRE_Int *HashKeys,
|
||||
HYPRE_Complex *HashVals,
|
||||
#else
|
||||
volatile HYPRE_Int *HashKeys,
|
||||
volatile HYPRE_Complex *HashVals,
|
||||
@ -38,7 +38,12 @@ hypre_spgemm_hash_insert_numer(
|
||||
HYPRE_Int j = 0;
|
||||
HYPRE_Int old = -1;
|
||||
|
||||
#if defined(HYPRE_USING_HIP) && (HIP_VERSION == 50422804)
|
||||
/* VPM: see https://github.com/hypre-space/hypre/issues/875 */
|
||||
#pragma unroll 8
|
||||
#else
|
||||
#pragma unroll UNROLL_FACTOR
|
||||
#endif
|
||||
for (HYPRE_Int i = 0; i < SHMEM_HASH_SIZE; i++)
|
||||
{
|
||||
/* compute the hash value of key */
|
||||
@ -87,8 +92,8 @@ static __device__ __forceinline__
|
||||
HYPRE_Int
|
||||
hypre_spgemm_hash_insert_numer( HYPRE_Int HashSize,
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
HYPRE_Int *HashKeys,
|
||||
HYPRE_Complex *HashVals,
|
||||
HYPRE_Int *HashKeys,
|
||||
HYPRE_Complex *HashVals,
|
||||
#else
|
||||
volatile HYPRE_Int *HashKeys,
|
||||
volatile HYPRE_Complex *HashVals,
|
||||
@ -160,8 +165,8 @@ hypre_spgemm_compute_row_numer( hypre_DeviceItem &item,
|
||||
HYPRE_Int *jc,
|
||||
HYPRE_Complex *ac,
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
HYPRE_Int *s_HashKeys,
|
||||
HYPRE_Complex *s_HashVals,
|
||||
HYPRE_Int *s_HashKeys,
|
||||
HYPRE_Complex *s_HashVals,
|
||||
#else
|
||||
volatile HYPRE_Int *s_HashKeys,
|
||||
volatile HYPRE_Complex *s_HashVals,
|
||||
@ -259,8 +264,8 @@ hypre_spgemm_copy_from_hash_into_C_row( hypre_DeviceItem &item,
|
||||
HYPRE_Int lane_id,
|
||||
HYPRE_Int do_shared_copy,
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
HYPRE_Int *s_HashKeys,
|
||||
HYPRE_Complex *s_HashVals,
|
||||
HYPRE_Int *s_HashKeys,
|
||||
HYPRE_Complex *s_HashVals,
|
||||
#else
|
||||
volatile HYPRE_Int *s_HashKeys,
|
||||
volatile HYPRE_Complex *s_HashVals,
|
||||
@ -691,13 +696,13 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m,
|
||||
template <HYPRE_Int GROUP_SIZE>
|
||||
__global__ void
|
||||
hypre_spgemm_copy_from_Cext_into_C( hypre_DeviceItem &item,
|
||||
HYPRE_Int M,
|
||||
HYPRE_Int *ix,
|
||||
HYPRE_Int *jx,
|
||||
HYPRE_Complex *ax,
|
||||
HYPRE_Int *ic,
|
||||
HYPRE_Int *jc,
|
||||
HYPRE_Complex *ac )
|
||||
HYPRE_Int M,
|
||||
HYPRE_Int *ix,
|
||||
HYPRE_Int *jx,
|
||||
HYPRE_Complex *ax,
|
||||
HYPRE_Int *ic,
|
||||
HYPRE_Int *jc,
|
||||
HYPRE_Complex *ac )
|
||||
{
|
||||
/* number of groups in the grid */
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
|
||||
@ -54,7 +54,8 @@ hypreDevice_CSRSpGemmRocsparse(HYPRE_Int m,
|
||||
|
||||
/* For rocSPARSE, the CSR SpGEMM implementation does not require the columns to be sorted! */
|
||||
/* RL: for matrices with long rows, it seemed that the sorting is still needed */
|
||||
#if 0
|
||||
/* VPM: Adding sorting back since it is necessary for correctness in a few cases */
|
||||
#if 1
|
||||
hypre_SortCSRRocsparse(m, k, nnzA, descrA, d_ia, d_ja_sorted, d_a_sorted);
|
||||
hypre_SortCSRRocsparse(k, n, nnzB, descrB, d_ib, d_jb_sorted, d_b_sorted);
|
||||
#endif
|
||||
|
||||
@ -23,7 +23,7 @@ static __device__ __forceinline__
|
||||
HYPRE_Int
|
||||
hypre_spgemm_hash_insert_symbl(
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
HYPRE_Int *HashKeys,
|
||||
HYPRE_Int *HashKeys,
|
||||
#else
|
||||
volatile HYPRE_Int *HashKeys,
|
||||
#endif
|
||||
@ -33,7 +33,12 @@ hypre_spgemm_hash_insert_symbl(
|
||||
HYPRE_Int j = 0;
|
||||
HYPRE_Int old = -1;
|
||||
|
||||
#if defined(HYPRE_USING_HIP) && (HIP_VERSION == 50422804)
|
||||
/* VPM: see https://github.com/hypre-space/hypre/issues/875 */
|
||||
#pragma unroll 8
|
||||
#else
|
||||
#pragma unroll UNROLL_FACTOR
|
||||
#endif
|
||||
for (HYPRE_Int i = 0; i < SHMEM_HASH_SIZE; i++)
|
||||
{
|
||||
/* compute the hash value of key */
|
||||
@ -75,7 +80,7 @@ static __device__ __forceinline__
|
||||
HYPRE_Int
|
||||
hypre_spgemm_hash_insert_symbl( HYPRE_Int HashSize,
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
HYPRE_Int *HashKeys,
|
||||
HYPRE_Int *HashKeys,
|
||||
#else
|
||||
volatile HYPRE_Int *HashKeys,
|
||||
#endif
|
||||
@ -133,7 +138,7 @@ hypre_spgemm_compute_row_symbl( hypre_DeviceItem &item,
|
||||
const HYPRE_Int *ib,
|
||||
const HYPRE_Int *jb,
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
HYPRE_Int *s_HashKeys,
|
||||
HYPRE_Int *s_HashKeys,
|
||||
#else
|
||||
volatile HYPRE_Int *s_HashKeys,
|
||||
#endif
|
||||
@ -226,7 +231,7 @@ template <HYPRE_Int NUM_GROUPS_PER_BLOCK, HYPRE_Int GROUP_SIZE, HYPRE_Int SHMEM_
|
||||
__global__ void
|
||||
hypre_spgemm_symbolic( hypre_DeviceItem &item,
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
char *shmem_ptr,
|
||||
char *shmem_ptr,
|
||||
#endif
|
||||
const HYPRE_Int M,
|
||||
const HYPRE_Int* __restrict__ rind,
|
||||
|
||||
@ -62,16 +62,16 @@ PCG Solve wall clock time = 0.177456 seconds
|
||||
PCG Setup wall clock time = 0.240707 seconds
|
||||
PCG Solve wall clock time = 0.189251 seconds
|
||||
# Output file: benchmark_ij.out.22
|
||||
PCG Setup wall clock time = 0.239942 seconds
|
||||
PCG Setup wall clock time = 0.291317 seconds
|
||||
PCG Solve wall clock time = 0.168473 seconds
|
||||
# Output file: benchmark_ij.out.23
|
||||
PCG Setup wall clock time = 0.235985 seconds
|
||||
PCG Setup wall clock time = 0.288176 seconds
|
||||
PCG Solve wall clock time = 0.168673 seconds
|
||||
# Output file: benchmark_ij.out.24
|
||||
PCG Setup wall clock time = 0.217153 seconds
|
||||
PCG Setup wall clock time = 0.258924 seconds
|
||||
PCG Solve wall clock time = 0.185827 seconds
|
||||
# Output file: benchmark_ij.out.25
|
||||
PCG Setup wall clock time = 0.205445 seconds
|
||||
PCG Setup wall clock time = 0.257346 seconds
|
||||
PCG Solve wall clock time = 0.198855 seconds
|
||||
# Output file: benchmark_ij.out.26
|
||||
PCG Setup wall clock time = 0.265667 seconds
|
||||
|
||||
@ -84,19 +84,19 @@ Final Relative Residual Norm = 4.476621e-09
|
||||
|
||||
# Output file: benchmark_ij.out.22
|
||||
Iterations = 13
|
||||
Final Relative Residual Norm = 1.703911e-09
|
||||
Final Relative Residual Norm = 1.810780e-09
|
||||
|
||||
# Output file: benchmark_ij.out.23
|
||||
Iterations = 13
|
||||
Final Relative Residual Norm = 5.943877e-09
|
||||
Final Relative Residual Norm = 6.103525e-09
|
||||
|
||||
# Output file: benchmark_ij.out.24
|
||||
Iterations = 14
|
||||
Final Relative Residual Norm = 3.250031e-09
|
||||
Final Relative Residual Norm = 3.116077e-09
|
||||
|
||||
# Output file: benchmark_ij.out.25
|
||||
Iterations = 15
|
||||
Final Relative Residual Norm = 5.483580e-09
|
||||
Final Relative Residual Norm = 5.382858e-09
|
||||
|
||||
# Output file: benchmark_ij.out.26
|
||||
Iterations = 37
|
||||
@ -105,4 +105,3 @@ Final Relative Residual Norm = 6.305999e-09
|
||||
# Output file: benchmark_ij.out.27
|
||||
Iterations = 22
|
||||
Final Relative Residual Norm = 8.787135e-09
|
||||
|
||||
|
||||
@ -23,13 +23,13 @@ Device Parcsr Matrix-by-Matrix wall clock time = 0.016367 seconds
|
||||
# Output file: benchmark_spgemm.out.12
|
||||
Device Parcsr Matrix-by-Matrix wall clock time = 0.019611 seconds
|
||||
# Output file: benchmark_spgemm.out.13
|
||||
Device Parcsr Matrix-by-Matrix wall clock time = 0.008411 seconds
|
||||
Device Parcsr Matrix-by-Matrix wall clock time = 0.018017 seconds
|
||||
# Output file: benchmark_spgemm.out.14
|
||||
Device Parcsr Matrix-by-Matrix wall clock time = 0.130224 seconds
|
||||
# Output file: benchmark_spgemm.out.15
|
||||
Device Parcsr Matrix-by-Matrix wall clock time = 0.001776 seconds
|
||||
Device Parcsr Matrix-by-Matrix wall clock time = 0.006545 seconds
|
||||
# Output file: benchmark_spgemm.out.16
|
||||
Device Parcsr Matrix-by-Matrix wall clock time = 0.007971 seconds
|
||||
Device Parcsr Matrix-by-Matrix wall clock time = 0.013026 seconds
|
||||
# Output file: benchmark_spgemm.out.17
|
||||
Device Parcsr Matrix-by-Matrix wall clock time = 0.002087 seconds
|
||||
# Output file: benchmark_spgemm.out.18
|
||||
|
||||
@ -14,7 +14,7 @@ PCG Solve wall clock time = 7.285773 seconds
|
||||
PCG Setup wall clock time = 0.012046 seconds
|
||||
PCG Solve wall clock time = 0.095558 seconds
|
||||
# Output file: benchmark_struct.out.6
|
||||
PCG Setup wall clock time = 0.019933 seconds
|
||||
PCG Setup wall clock time = 0.016907 seconds
|
||||
PCG Solve wall clock time = 0.241860 seconds
|
||||
# Output file: benchmark_struct.out.7
|
||||
PCG Setup wall clock time = 0.053942 seconds
|
||||
|
||||
@ -371,7 +371,6 @@ HYPRE_Finalize(void)
|
||||
#endif
|
||||
|
||||
hypre_HandleDestroy(_hypre_handle);
|
||||
|
||||
_hypre_handle = NULL;
|
||||
|
||||
#if !defined(HYPRE_USING_SYCL)
|
||||
@ -383,6 +382,7 @@ HYPRE_Finalize(void)
|
||||
hypre_memory_tracker_print, hypre_memory_tracker_filename);
|
||||
|
||||
hypre_MemoryTrackerDestroy(_hypre_memory_tracker);
|
||||
_hypre_memory_tracker = NULL;
|
||||
#endif
|
||||
|
||||
return hypre_error_flag;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user