From ad5d7e009f32c1446614e906999c02fe5914ecfc Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Thu, 10 Jun 2021 11:10:13 -0700 Subject: [PATCH] Gpu mixedInt (#380) This PR adds GPU support for mixedInt. Co-authored-by: Rob Falgout --- AUTOTEST/machine-lassen.sh | 8 ++- AUTOTEST/machine-ray.sh | 8 ++- src/IJ_mv/IJMatrix_parcsr_device.c | 4 +- src/IJ_mv/IJVector_parcsr_device.c | 2 +- src/parcsr_ls/par_ilu_setup.c | 2 +- src/parcsr_ls/par_strength_device.c | 6 +- src/parcsr_mv/par_csr_fffc_device.c | 84 ++++++++++++++-------------- src/parcsr_mv/par_csr_matop_device.c | 2 +- src/seq_mv/csr_matop_device.c | 6 +- src/struct_mv/Makefile | 1 + src/utilities/Makefile | 3 +- src/utilities/_hypre_utilities.hpp | 2 +- src/utilities/cuda_utils.c | 53 ++++++++++++------ src/utilities/cuda_utils.h | 2 +- 14 files changed, 109 insertions(+), 74 deletions(-) diff --git a/AUTOTEST/machine-lassen.sh b/AUTOTEST/machine-lassen.sh index b9a6877b5..3d783ee7f 100755 --- a/AUTOTEST/machine-lassen.sh +++ b/AUTOTEST/machine-lassen.sh @@ -11,7 +11,7 @@ case $1 in -h|-help) cat <() ); - hypre_assert(tmp_end - col_map_offd_AFF == num_cols_AFF_offd); + HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if, + thrust::make_transform_iterator(recv_buf, -_1-1), + thrust::make_transform_iterator(recv_buf, -_1-1) + num_cols_A_offd, + offd_mark, + col_map_offd_AFF, + thrust::identity() ); + hypre_assert(tmp_end_big - col_map_offd_AFF == num_cols_AFF_offd); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); AFF = hypre_ParCSRMatrixCreate(comm, @@ -597,13 +597,13 @@ hypre_ParCSRMatrixGenerateFFFCDevice_core( hypre_ParCSRMatrix *A, tmp_j, AFC_offd_j ); col_map_offd_AFC = hypre_TAlloc(HYPRE_BigInt, num_cols_AFC_offd, HYPRE_MEMORY_DEVICE); - tmp_end = HYPRE_THRUST_CALL( copy_if, - recv_buf, - recv_buf + num_cols_A_offd, - offd_mark, - col_map_offd_AFC, - thrust::identity()); - hypre_assert(tmp_end - col_map_offd_AFC == num_cols_AFC_offd); + HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if, + recv_buf, + recv_buf + num_cols_A_offd, + offd_mark, + col_map_offd_AFC, + thrust::identity()); + hypre_assert(tmp_end_big - col_map_offd_AFC == num_cols_AFC_offd); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); /* AFC */ @@ -745,13 +745,13 @@ hypre_ParCSRMatrixGenerateFFFCDevice_core( hypre_ParCSRMatrix *A, tmp_j, ACF_offd_j ); col_map_offd_ACF = hypre_TAlloc(HYPRE_BigInt, num_cols_ACF_offd, HYPRE_MEMORY_DEVICE); - tmp_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_transform_iterator(recv_buf, -_1-1), - thrust::make_transform_iterator(recv_buf, -_1-1) + num_cols_A_offd, - offd_mark, - col_map_offd_ACF, - thrust::identity()); - hypre_assert(tmp_end - col_map_offd_ACF == num_cols_ACF_offd); + HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if, + thrust::make_transform_iterator(recv_buf, -_1-1), + thrust::make_transform_iterator(recv_buf, -_1-1) + num_cols_A_offd, + offd_mark, + col_map_offd_ACF, + thrust::identity()); + hypre_assert(tmp_end_big - col_map_offd_ACF == num_cols_ACF_offd); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); /* ACF */ @@ -894,13 +894,13 @@ hypre_ParCSRMatrixGenerateFFFCDevice_core( hypre_ParCSRMatrix *A, tmp_j, ACC_offd_j ); col_map_offd_ACC = hypre_TAlloc(HYPRE_BigInt, num_cols_ACC_offd, HYPRE_MEMORY_DEVICE); - tmp_end = HYPRE_THRUST_CALL( copy_if, - recv_buf, - recv_buf + num_cols_A_offd, - offd_mark, - col_map_offd_ACC, - thrust::identity()); - hypre_assert(tmp_end - col_map_offd_ACC == num_cols_ACC_offd); + HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if, + recv_buf, + recv_buf + num_cols_A_offd, + offd_mark, + col_map_offd_ACC, + thrust::identity()); + hypre_assert(tmp_end_big - col_map_offd_ACC == num_cols_ACC_offd); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); /* ACC */ @@ -1217,13 +1217,13 @@ hypre_ParCSRMatrixGenerate1DCFDevice( hypre_ParCSRMatrix *A, tmp_j, ACX_offd_j ); col_map_offd_ACX = hypre_TAlloc(HYPRE_BigInt, num_cols_ACX_offd, HYPRE_MEMORY_DEVICE); - tmp_end = HYPRE_THRUST_CALL( copy_if, - col_map_offd_A, - col_map_offd_A + num_cols_A_offd, - offd_mark, - col_map_offd_ACX, - thrust::identity()); - hypre_assert(tmp_end - col_map_offd_ACX == num_cols_ACX_offd); + HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if, + col_map_offd_A, + col_map_offd_A + num_cols_A_offd, + offd_mark, + col_map_offd_ACX, + thrust::identity()); + hypre_assert(tmp_end_big - col_map_offd_ACX == num_cols_ACX_offd); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); /* ACX */ @@ -1354,13 +1354,13 @@ hypre_ParCSRMatrixGenerate1DCFDevice( hypre_ParCSRMatrix *A, tmp_j, AXC_offd_j ); col_map_offd_AXC = hypre_TAlloc(HYPRE_BigInt, num_cols_AXC_offd, HYPRE_MEMORY_DEVICE); - tmp_end = HYPRE_THRUST_CALL( copy_if, - recv_buf, - recv_buf + num_cols_A_offd, - offd_mark, - col_map_offd_AXC, - thrust::identity()); - hypre_assert(tmp_end - col_map_offd_AXC == num_cols_AXC_offd); + HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if, + recv_buf, + recv_buf + num_cols_A_offd, + offd_mark, + col_map_offd_AXC, + thrust::identity()); + hypre_assert(tmp_end_big - col_map_offd_AXC == num_cols_AXC_offd); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); /* AXC */ diff --git a/src/parcsr_mv/par_csr_matop_device.c b/src/parcsr_mv/par_csr_matop_device.c index 8643e7437..01bcf2be8 100644 --- a/src/parcsr_mv/par_csr_matop_device.c +++ b/src/parcsr_mv/par_csr_matop_device.c @@ -14,7 +14,7 @@ HYPRE_Int hypre_ParcsrGetExternalRowsDeviceInit( hypre_ParCSRMatrix *A, HYPRE_Int indices_len, - HYPRE_Int *indices, + HYPRE_BigInt *indices, hypre_ParCSRCommPkg *comm_pkg, HYPRE_Int want_data, void **request_ptr) diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index 2b57330b9..5af3fef58 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -432,9 +432,9 @@ hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: qu col_map_offd_C, col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - HYPRE_Int *new_end = HYPRE_THRUST_CALL( unique, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); + HYPRE_BigInt *new_end = HYPRE_THRUST_CALL( unique, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); num_cols_offd_C = new_end - col_map_offd_C; diff --git a/src/struct_mv/Makefile b/src/struct_mv/Makefile index 6d1b4ebad..d0b67c687 100644 --- a/src/struct_mv/Makefile +++ b/src/struct_mv/Makefile @@ -17,6 +17,7 @@ C_COMPILE_FLAGS = \ HEADERS =\ HYPRE_struct_mv.h\ _hypre_struct_mv.h\ + _hypre_struct_mv.hpp\ assumed_part.h\ box.h\ box_manager.h\ diff --git a/src/utilities/Makefile b/src/utilities/Makefile index 8281d38f3..cfd899e32 100644 --- a/src/utilities/Makefile +++ b/src/utilities/Makefile @@ -29,7 +29,8 @@ HEADERS =\ mpistubs.h\ threading.h\ timing.h\ - _hypre_utilities.h + _hypre_utilities.h\ + _hypre_utilities.hpp FILES =\ F90_HYPRE_error.c\ diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 75fe8ecb0..107f436ff 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -940,7 +940,7 @@ HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v); HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia, HYPRE_Int *d_rownnz); -HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int job, HYPRE_Int has_offd, HYPRE_Int first_col, HYPRE_Int *d_col_map_offd_A, HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); +HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int job, HYPRE_Int has_offd, HYPRE_BigInt first_col, HYPRE_BigInt *d_col_map_offd_A, HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); diff --git a/src/utilities/cuda_utils.c b/src/utilities/cuda_utils.c index 4fd055d90..a07b999b6 100644 --- a/src/utilities/cuda_utils.c +++ b/src/utilities/cuda_utils.c @@ -160,11 +160,20 @@ hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_di } __global__ void -hypreCUDAKernel_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int has_offd, - HYPRE_BigInt first_col, HYPRE_Int *d_col_map_offd_A, - HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, - HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, - HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab) +hypreCUDAKernel_CopyParCSRRows(HYPRE_Int nrows, + HYPRE_Int *d_row_indices, + HYPRE_Int has_offd, + HYPRE_BigInt first_col, + HYPRE_BigInt *d_col_map_offd_A, + HYPRE_Int *d_diag_i, + HYPRE_Int *d_diag_j, + HYPRE_Complex *d_diag_a, + HYPRE_Int *d_offd_i, + HYPRE_Int *d_offd_j, + HYPRE_Complex *d_offd_a, + HYPRE_Int *d_ib, + HYPRE_BigInt *d_jb, + HYPRE_Complex *d_ab) { const HYPRE_Int global_warp_id = hypre_cuda_get_grid_warp_id<1,1>(); @@ -251,11 +260,21 @@ hypreCUDAKernel_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_ * If col_map_offd_A == NULL, use (-1 - d_offd_j) as column id * If nrows == 1 and d_ib == NULL, it means d_ib[0] = 0 */ HYPRE_Int -hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int job, HYPRE_Int has_offd, - HYPRE_BigInt first_col, HYPRE_BigInt *d_col_map_offd_A, - HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, - HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, - HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab) +hypreDevice_CopyParCSRRows(HYPRE_Int nrows, + HYPRE_Int *d_row_indices, + HYPRE_Int job, + HYPRE_Int has_offd, + HYPRE_BigInt first_col, + HYPRE_BigInt *d_col_map_offd_A, + HYPRE_Int *d_diag_i, + HYPRE_Int *d_diag_j, + HYPRE_Complex *d_diag_a, + HYPRE_Int *d_offd_i, + HYPRE_Int *d_offd_j, + HYPRE_Complex *d_offd_a, + HYPRE_Int *d_ib, + HYPRE_BigInt *d_jb, + HYPRE_Complex *d_ab) { /* trivial case */ if (nrows <= 0) @@ -368,6 +387,8 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ return hypre_error_flag; } +/* Input: d_row_num, of size nrows, contains the rows indices that can be BigInt or Int + * Output: d_row_ind */ template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind) @@ -390,7 +411,7 @@ hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_ } template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind); -#if defined(HYPRE_MIXEDINT) || defined(HYPRE_BIGINT) +#if defined(HYPRE_MIXEDINT) template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); #endif @@ -731,9 +752,9 @@ hypreDevice_StableSortByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *vals, HY return hypre_error_flag; } -template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, HYPRE_Int *vals, HYPRE_Int opt); -template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Real *keys2, HYPRE_Int *vals, HYPRE_Int opt); -template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_BigInt *keys1, HYPRE_BigInt *keys2, HYPRE_Complex *vals, HYPRE_Int opt); +template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, HYPRE_Int *vals, HYPRE_Int opt); +template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Real *keys2, HYPRE_Int *vals, HYPRE_Int opt); +template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, HYPRE_Complex *vals, HYPRE_Int opt); /* opt: * 0, (a,b) < (a',b') iff a < a' or (a = a' and b < b') [normal tupe comp] @@ -759,10 +780,10 @@ hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *val return hypre_error_flag; } -#if defined(HYPRE_MIXEDINT) || defined(HYPRE_BIGINT) +template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt); +#if defined(HYPRE_MIXEDINT) template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_BigInt *keys1, HYPRE_BigInt *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt); #endif -template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt); template HYPRE_Int diff --git a/src/utilities/cuda_utils.h b/src/utilities/cuda_utils.h index 4394a892a..76e369bfe 100644 --- a/src/utilities/cuda_utils.h +++ b/src/utilities/cuda_utils.h @@ -875,7 +875,7 @@ HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v); HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia, HYPRE_Int *d_rownnz); -HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int job, HYPRE_Int has_offd, HYPRE_Int first_col, HYPRE_Int *d_col_map_offd_A, HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); +HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int job, HYPRE_Int has_offd, HYPRE_BigInt first_col, HYPRE_BigInt *d_col_map_offd_A, HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i);