Gpu mixedInt (#380)

This PR adds GPU support for mixedInt. 

Co-authored-by: Rob Falgout <rfalgout@llnl.gov>
This commit is contained in:
Ruipeng Li 2021-06-10 11:10:13 -07:00 committed by GitHub
parent 1d9411c7ab
commit ad5d7e009f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 109 additions and 74 deletions

View File

@ -11,7 +11,7 @@ case $1 in
-h|-help) -h|-help)
cat <<EOF cat <<EOF
**** Only run this script on the lassen/ray cluster **** **** Only run this script on the lassen cluster ****
$0 [-h|-help] {src_dir} $0 [-h|-help] {src_dir}
@ -53,6 +53,12 @@ eo="-gpu -rt -mpibind -save ${save}"
./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo
./renametest.sh basic $output_dir/basic-cuda-um ./renametest.sh basic $output_dir/basic-cuda-um
#CUDA with UM and mixed-int
co="--with-cuda --enable-unified-memory --enable-mixedint --enable-debug --with-gpu-arch=\\'60 70\\' --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
ro="-ij-mixed -ams -struct -sstruct-mixed -rt -mpibind -save ${save}"
./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
./renametest.sh basic $output_dir/basic-cuda-um-mixedint
# CUDA with UM with shared library [no run] # CUDA with UM with shared library [no run]
co="--with-cuda --enable-unified-memory --with-openmp --enable-hopscotch --enable-shared --with-gpu-arch=\\'60 70\\' --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" co="--with-cuda --enable-unified-memory --with-openmp --enable-hopscotch --enable-shared --with-gpu-arch=\\'60 70\\' --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
./test.sh basic.sh $src_dir -co: $co -mo: $mo ./test.sh basic.sh $src_dir -co: $co -mo: $mo

View File

@ -11,7 +11,7 @@ case $1 in
-h|-help) -h|-help)
cat <<EOF cat <<EOF
**** Only run this script on the lassen/ray cluster **** **** Only run this script on the ray cluster ****
$0 [-h|-help] {src_dir} $0 [-h|-help] {src_dir}
@ -53,6 +53,12 @@ eo="-gpu -rt -mpibind -save ${save}"
./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo
./renametest.sh basic $output_dir/basic-cuda-um ./renametest.sh basic $output_dir/basic-cuda-um
#CUDA with UM and mixed-int
co="--with-cuda --enable-unified-memory --enable-mixedint --enable-debug --with-gpu-arch=\\'60 70\\' --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
ro="-ij-mixed -ams -struct -sstruct-mixed -rt -mpibind -save ${save}"
./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
./renametest.sh basic $output_dir/basic-cuda-um-mixedint
# CUDA with UM with shared library [no run] # CUDA with UM with shared library [no run]
co="--with-cuda --enable-unified-memory --with-openmp --enable-hopscotch --enable-shared --with-gpu-arch=\\'60 70\\' --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" co="--with-cuda --enable-unified-memory --with-openmp --enable-hopscotch --enable-shared --with-gpu-arch=\\'60 70\\' --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
./test.sh basic.sh $src_dir -co: $co -mo: $mo ./test.sh basic.sh $src_dir -co: $co -mo: $mo

View File

@ -307,8 +307,8 @@ hypre_IJMatrixAssembleSortAndReduce3(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big
{ {
hypreDevice_StableSortTupleByTupleKey(N0, I0, J0, X0, A0, 0); hypreDevice_StableSortTupleByTupleKey(N0, I0, J0, X0, A0, 0);
HYPRE_Int *I = hypre_TAlloc(HYPRE_Int, N0, HYPRE_MEMORY_DEVICE); HYPRE_BigInt *I = hypre_TAlloc(HYPRE_BigInt, N0, HYPRE_MEMORY_DEVICE);
HYPRE_Int *J = hypre_TAlloc(HYPRE_Int, N0, HYPRE_MEMORY_DEVICE); HYPRE_BigInt *J = hypre_TAlloc(HYPRE_BigInt, N0, HYPRE_MEMORY_DEVICE);
HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, N0, HYPRE_MEMORY_DEVICE); HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, N0, HYPRE_MEMORY_DEVICE);
/* output in X0: 0: keep, 1: zero-out */ /* output in X0: 0: keep, 1: zero-out */

View File

@ -309,7 +309,7 @@ hypre_IJVectorAssembleSortAndReduce3(HYPRE_Int N0, HYPRE_BigInt *I0, char *X0,
I0 + N0, I0 + N0,
thrust::make_zip_iterator(thrust::make_tuple(X0, A0)) ); thrust::make_zip_iterator(thrust::make_tuple(X0, A0)) );
HYPRE_Int *I = hypre_TAlloc(HYPRE_Int, N0, HYPRE_MEMORY_DEVICE); HYPRE_BigInt *I = hypre_TAlloc(HYPRE_BigInt, N0, HYPRE_MEMORY_DEVICE);
HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, N0, HYPRE_MEMORY_DEVICE); HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, N0, HYPRE_MEMORY_DEVICE);
/* output in X0: 0: keep, 1: zero-out */ /* output in X0: 0: keep, 1: zero-out */

View File

@ -3444,7 +3444,7 @@ hypre_ILUSetupRAPILU0Device(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int n,
S_row_starts[1] = global_start; S_row_starts[1] = global_start;
} }
S_row_starts = hypre_CTAlloc(HYPRE_Int,2,HYPRE_MEMORY_HOST); S_row_starts = hypre_CTAlloc(HYPRE_BigInt, 2, HYPRE_MEMORY_HOST);
S_row_starts[1] = S_total_rows; S_row_starts[1] = S_total_rows;
S_row_starts[0] = S_total_rows - m; S_row_starts[0] = S_total_rows - m;
hypre_MPI_Allreduce(&m, &S_total_rows, 1, HYPRE_MPI_INT, hypre_MPI_SUM, comm); hypre_MPI_Allreduce(&m, &S_total_rows, 1, HYPRE_MPI_INT, hypre_MPI_SUM, comm);

View File

@ -43,7 +43,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A,
HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd); HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd); HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd);
HYPRE_Int *row_starts = hypre_ParCSRMatrixRowStarts(A); HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A);
HYPRE_Int num_variables = hypre_CSRMatrixNumRows(A_diag); HYPRE_Int num_variables = hypre_CSRMatrixNumRows(A_diag);
HYPRE_Int global_num_vars = hypre_ParCSRMatrixGlobalNumRows(A); HYPRE_Int global_num_vars = hypre_ParCSRMatrixGlobalNumRows(A);
HYPRE_Int num_nonzeros_diag; HYPRE_Int num_nonzeros_diag;
@ -178,9 +178,9 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A,
hypre_ParCSRMatrixCommPkg(S) = NULL; hypre_ParCSRMatrixCommPkg(S) = NULL;
hypre_ParCSRMatrixColMapOffd(S) = hypre_TAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); hypre_ParCSRMatrixColMapOffd(S) = hypre_TAlloc(HYPRE_BigInt, num_cols_offd, HYPRE_MEMORY_HOST);
hypre_TMemcpy(hypre_ParCSRMatrixColMapOffd(S), hypre_ParCSRMatrixColMapOffd(A), hypre_TMemcpy(hypre_ParCSRMatrixColMapOffd(S), hypre_ParCSRMatrixColMapOffd(A),
HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST); HYPRE_BigInt, num_cols_offd, HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
hypre_ParCSRMatrixSocDiagJ(S) = S_temp_diag_j; hypre_ParCSRMatrixSocDiagJ(S) = S_temp_diag_j;
hypre_ParCSRMatrixSocOffdJ(S) = S_temp_offd_j; hypre_ParCSRMatrixSocOffdJ(S) = S_temp_offd_j;

View File

@ -450,13 +450,13 @@ hypre_ParCSRMatrixGenerateFFFCDevice_core( hypre_ParCSRMatrix *A,
tmp_j, tmp_j,
AFF_offd_j ); AFF_offd_j );
col_map_offd_AFF = hypre_TAlloc(HYPRE_BigInt, num_cols_AFF_offd, HYPRE_MEMORY_DEVICE); col_map_offd_AFF = hypre_TAlloc(HYPRE_BigInt, num_cols_AFF_offd, HYPRE_MEMORY_DEVICE);
tmp_end = HYPRE_THRUST_CALL( copy_if, HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if,
thrust::make_transform_iterator(recv_buf, -_1-1), thrust::make_transform_iterator(recv_buf, -_1-1),
thrust::make_transform_iterator(recv_buf, -_1-1) + num_cols_A_offd, thrust::make_transform_iterator(recv_buf, -_1-1) + num_cols_A_offd,
offd_mark, offd_mark,
col_map_offd_AFF, col_map_offd_AFF,
thrust::identity<HYPRE_Int>() ); thrust::identity<HYPRE_Int>() );
hypre_assert(tmp_end - col_map_offd_AFF == num_cols_AFF_offd); hypre_assert(tmp_end_big - col_map_offd_AFF == num_cols_AFF_offd);
hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE);
AFF = hypre_ParCSRMatrixCreate(comm, AFF = hypre_ParCSRMatrixCreate(comm,
@ -597,13 +597,13 @@ hypre_ParCSRMatrixGenerateFFFCDevice_core( hypre_ParCSRMatrix *A,
tmp_j, tmp_j,
AFC_offd_j ); AFC_offd_j );
col_map_offd_AFC = hypre_TAlloc(HYPRE_BigInt, num_cols_AFC_offd, HYPRE_MEMORY_DEVICE); col_map_offd_AFC = hypre_TAlloc(HYPRE_BigInt, num_cols_AFC_offd, HYPRE_MEMORY_DEVICE);
tmp_end = HYPRE_THRUST_CALL( copy_if, HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if,
recv_buf, recv_buf,
recv_buf + num_cols_A_offd, recv_buf + num_cols_A_offd,
offd_mark, offd_mark,
col_map_offd_AFC, col_map_offd_AFC,
thrust::identity<HYPRE_Int>()); thrust::identity<HYPRE_Int>());
hypre_assert(tmp_end - col_map_offd_AFC == num_cols_AFC_offd); hypre_assert(tmp_end_big - col_map_offd_AFC == num_cols_AFC_offd);
hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE);
/* AFC */ /* AFC */
@ -745,13 +745,13 @@ hypre_ParCSRMatrixGenerateFFFCDevice_core( hypre_ParCSRMatrix *A,
tmp_j, tmp_j,
ACF_offd_j ); ACF_offd_j );
col_map_offd_ACF = hypre_TAlloc(HYPRE_BigInt, num_cols_ACF_offd, HYPRE_MEMORY_DEVICE); col_map_offd_ACF = hypre_TAlloc(HYPRE_BigInt, num_cols_ACF_offd, HYPRE_MEMORY_DEVICE);
tmp_end = HYPRE_THRUST_CALL( copy_if, HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if,
thrust::make_transform_iterator(recv_buf, -_1-1), thrust::make_transform_iterator(recv_buf, -_1-1),
thrust::make_transform_iterator(recv_buf, -_1-1) + num_cols_A_offd, thrust::make_transform_iterator(recv_buf, -_1-1) + num_cols_A_offd,
offd_mark, offd_mark,
col_map_offd_ACF, col_map_offd_ACF,
thrust::identity<HYPRE_Int>()); thrust::identity<HYPRE_Int>());
hypre_assert(tmp_end - col_map_offd_ACF == num_cols_ACF_offd); hypre_assert(tmp_end_big - col_map_offd_ACF == num_cols_ACF_offd);
hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE);
/* ACF */ /* ACF */
@ -894,13 +894,13 @@ hypre_ParCSRMatrixGenerateFFFCDevice_core( hypre_ParCSRMatrix *A,
tmp_j, tmp_j,
ACC_offd_j ); ACC_offd_j );
col_map_offd_ACC = hypre_TAlloc(HYPRE_BigInt, num_cols_ACC_offd, HYPRE_MEMORY_DEVICE); col_map_offd_ACC = hypre_TAlloc(HYPRE_BigInt, num_cols_ACC_offd, HYPRE_MEMORY_DEVICE);
tmp_end = HYPRE_THRUST_CALL( copy_if, HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if,
recv_buf, recv_buf,
recv_buf + num_cols_A_offd, recv_buf + num_cols_A_offd,
offd_mark, offd_mark,
col_map_offd_ACC, col_map_offd_ACC,
thrust::identity<HYPRE_Int>()); thrust::identity<HYPRE_Int>());
hypre_assert(tmp_end - col_map_offd_ACC == num_cols_ACC_offd); hypre_assert(tmp_end_big - col_map_offd_ACC == num_cols_ACC_offd);
hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE);
/* ACC */ /* ACC */
@ -1217,13 +1217,13 @@ hypre_ParCSRMatrixGenerate1DCFDevice( hypre_ParCSRMatrix *A,
tmp_j, tmp_j,
ACX_offd_j ); ACX_offd_j );
col_map_offd_ACX = hypre_TAlloc(HYPRE_BigInt, num_cols_ACX_offd, HYPRE_MEMORY_DEVICE); col_map_offd_ACX = hypre_TAlloc(HYPRE_BigInt, num_cols_ACX_offd, HYPRE_MEMORY_DEVICE);
tmp_end = HYPRE_THRUST_CALL( copy_if, HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if,
col_map_offd_A, col_map_offd_A,
col_map_offd_A + num_cols_A_offd, col_map_offd_A + num_cols_A_offd,
offd_mark, offd_mark,
col_map_offd_ACX, col_map_offd_ACX,
thrust::identity<HYPRE_Int>()); thrust::identity<HYPRE_Int>());
hypre_assert(tmp_end - col_map_offd_ACX == num_cols_ACX_offd); hypre_assert(tmp_end_big - col_map_offd_ACX == num_cols_ACX_offd);
hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE);
/* ACX */ /* ACX */
@ -1354,13 +1354,13 @@ hypre_ParCSRMatrixGenerate1DCFDevice( hypre_ParCSRMatrix *A,
tmp_j, tmp_j,
AXC_offd_j ); AXC_offd_j );
col_map_offd_AXC = hypre_TAlloc(HYPRE_BigInt, num_cols_AXC_offd, HYPRE_MEMORY_DEVICE); col_map_offd_AXC = hypre_TAlloc(HYPRE_BigInt, num_cols_AXC_offd, HYPRE_MEMORY_DEVICE);
tmp_end = HYPRE_THRUST_CALL( copy_if, HYPRE_BigInt *tmp_end_big = HYPRE_THRUST_CALL( copy_if,
recv_buf, recv_buf,
recv_buf + num_cols_A_offd, recv_buf + num_cols_A_offd,
offd_mark, offd_mark,
col_map_offd_AXC, col_map_offd_AXC,
thrust::identity<HYPRE_Int>()); thrust::identity<HYPRE_Int>());
hypre_assert(tmp_end - col_map_offd_AXC == num_cols_AXC_offd); hypre_assert(tmp_end_big - col_map_offd_AXC == num_cols_AXC_offd);
hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE); hypre_TFree(tmp_j, HYPRE_MEMORY_DEVICE);
/* AXC */ /* AXC */

View File

@ -14,7 +14,7 @@
HYPRE_Int HYPRE_Int
hypre_ParcsrGetExternalRowsDeviceInit( hypre_ParCSRMatrix *A, hypre_ParcsrGetExternalRowsDeviceInit( hypre_ParCSRMatrix *A,
HYPRE_Int indices_len, HYPRE_Int indices_len,
HYPRE_Int *indices, HYPRE_BigInt *indices,
hypre_ParCSRCommPkg *comm_pkg, hypre_ParCSRCommPkg *comm_pkg,
HYPRE_Int want_data, HYPRE_Int want_data,
void **request_ptr) void **request_ptr)

View File

@ -432,9 +432,9 @@ hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: qu
col_map_offd_C, col_map_offd_C,
col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B );
HYPRE_Int *new_end = HYPRE_THRUST_CALL( unique, HYPRE_BigInt *new_end = HYPRE_THRUST_CALL( unique,
col_map_offd_C, col_map_offd_C,
col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B );
num_cols_offd_C = new_end - col_map_offd_C; num_cols_offd_C = new_end - col_map_offd_C;

View File

@ -17,6 +17,7 @@ C_COMPILE_FLAGS = \
HEADERS =\ HEADERS =\
HYPRE_struct_mv.h\ HYPRE_struct_mv.h\
_hypre_struct_mv.h\ _hypre_struct_mv.h\
_hypre_struct_mv.hpp\
assumed_part.h\ assumed_part.h\
box.h\ box.h\
box_manager.h\ box_manager.h\

View File

@ -29,7 +29,8 @@ HEADERS =\
mpistubs.h\ mpistubs.h\
threading.h\ threading.h\
timing.h\ timing.h\
_hypre_utilities.h _hypre_utilities.h\
_hypre_utilities.hpp
FILES =\ FILES =\
F90_HYPRE_error.c\ F90_HYPRE_error.c\

View File

@ -940,7 +940,7 @@ HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v);
HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia, HYPRE_Int *d_rownnz); HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia, HYPRE_Int *d_rownnz);
HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int job, HYPRE_Int has_offd, HYPRE_Int first_col, HYPRE_Int *d_col_map_offd_A, HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int job, HYPRE_Int has_offd, HYPRE_BigInt first_col, HYPRE_BigInt *d_col_map_offd_A, HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab);
HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i);

View File

@ -160,11 +160,20 @@ hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_di
} }
__global__ void __global__ void
hypreCUDAKernel_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int has_offd, hypreCUDAKernel_CopyParCSRRows(HYPRE_Int nrows,
HYPRE_BigInt first_col, HYPRE_Int *d_col_map_offd_A, HYPRE_Int *d_row_indices,
HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int has_offd,
HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_BigInt first_col,
HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab) HYPRE_BigInt *d_col_map_offd_A,
HYPRE_Int *d_diag_i,
HYPRE_Int *d_diag_j,
HYPRE_Complex *d_diag_a,
HYPRE_Int *d_offd_i,
HYPRE_Int *d_offd_j,
HYPRE_Complex *d_offd_a,
HYPRE_Int *d_ib,
HYPRE_BigInt *d_jb,
HYPRE_Complex *d_ab)
{ {
const HYPRE_Int global_warp_id = hypre_cuda_get_grid_warp_id<1,1>(); const HYPRE_Int global_warp_id = hypre_cuda_get_grid_warp_id<1,1>();
@ -251,11 +260,21 @@ hypreCUDAKernel_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_
* If col_map_offd_A == NULL, use (-1 - d_offd_j) as column id * If col_map_offd_A == NULL, use (-1 - d_offd_j) as column id
* If nrows == 1 and d_ib == NULL, it means d_ib[0] = 0 */ * If nrows == 1 and d_ib == NULL, it means d_ib[0] = 0 */
HYPRE_Int HYPRE_Int
hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int job, HYPRE_Int has_offd, hypreDevice_CopyParCSRRows(HYPRE_Int nrows,
HYPRE_BigInt first_col, HYPRE_BigInt *d_col_map_offd_A, HYPRE_Int *d_row_indices,
HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int job,
HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int has_offd,
HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab) HYPRE_BigInt first_col,
HYPRE_BigInt *d_col_map_offd_A,
HYPRE_Int *d_diag_i,
HYPRE_Int *d_diag_j,
HYPRE_Complex *d_diag_a,
HYPRE_Int *d_offd_i,
HYPRE_Int *d_offd_j,
HYPRE_Complex *d_offd_a,
HYPRE_Int *d_ib,
HYPRE_BigInt *d_jb,
HYPRE_Complex *d_ab)
{ {
/* trivial case */ /* trivial case */
if (nrows <= 0) if (nrows <= 0)
@ -368,6 +387,8 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_
return hypre_error_flag; return hypre_error_flag;
} }
/* Input: d_row_num, of size nrows, contains the rows indices that can be BigInt or Int
* Output: d_row_ind */
template <typename T> template <typename T>
HYPRE_Int HYPRE_Int
hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind) hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind)
@ -390,7 +411,7 @@ hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_
} }
template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind); template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind);
#if defined(HYPRE_MIXEDINT) || defined(HYPRE_BIGINT) #if defined(HYPRE_MIXEDINT)
template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind);
#endif #endif
@ -731,9 +752,9 @@ hypreDevice_StableSortByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *vals, HY
return hypre_error_flag; return hypre_error_flag;
} }
template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, HYPRE_Int *vals, HYPRE_Int opt); template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, HYPRE_Int *vals, HYPRE_Int opt);
template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Real *keys2, HYPRE_Int *vals, HYPRE_Int opt); template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Real *keys2, HYPRE_Int *vals, HYPRE_Int opt);
template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_BigInt *keys1, HYPRE_BigInt *keys2, HYPRE_Complex *vals, HYPRE_Int opt); template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, HYPRE_Complex *vals, HYPRE_Int opt);
/* opt: /* opt:
* 0, (a,b) < (a',b') iff a < a' or (a = a' and b < b') [normal tupe comp] * 0, (a,b) < (a',b') iff a < a' or (a = a' and b < b') [normal tupe comp]
@ -759,10 +780,10 @@ hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *val
return hypre_error_flag; return hypre_error_flag;
} }
#if defined(HYPRE_MIXEDINT) || defined(HYPRE_BIGINT) template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt);
#if defined(HYPRE_MIXEDINT)
template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_BigInt *keys1, HYPRE_BigInt *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt); template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_BigInt *keys1, HYPRE_BigInt *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt);
#endif #endif
template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt);
template <typename T1, typename T2, typename T3> template <typename T1, typename T2, typename T3>
HYPRE_Int HYPRE_Int

View File

@ -875,7 +875,7 @@ HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v);
HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia, HYPRE_Int *d_rownnz); HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia, HYPRE_Int *d_rownnz);
HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int job, HYPRE_Int has_offd, HYPRE_Int first_col, HYPRE_Int *d_col_map_offd_A, HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int job, HYPRE_Int has_offd, HYPRE_BigInt first_col, HYPRE_BigInt *d_col_map_offd_A, HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab);
HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i);