diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7c4ba626a..78784e521 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -113,6 +113,10 @@ option(HYPRE_ENABLE_CUBLAS "Use cuBLAS" OFF) option(HYPRE_ENABLE_CURAND "Use cuRAND" ON) option(HYPRE_ENABLE_GPU_PROFILING "Use NVTX on CUDA" OFF) set(HYPRE_CUDA_SM "70" CACHE STRING "Target CUDA architecture.") +# oneAPI options +option(HYPRE_ENABLE_ONEMKLSPARSE "Use oneMKL sparse" ON) +option(HYPRE_ENABLE_ONEMKLBLAS "Use oneMKL blas" ON) +option(HYPRE_ENABLE_ONEMKLRAND "Use oneMKL rand" ON) option(TPL_DSUPERLU_LIBRARIES "List of absolute paths to SuperLU_Dist link libraries [].") option(TPL_DSUPERLU_INCLUDE_DIRS "List of absolute paths to SuperLU_Dist include directories [].") @@ -421,6 +425,26 @@ if (HYPRE_USING_CUDA) endif () endif () +if (HYPRE_USING_SYCL) + if (HYPRE_ENABLE_ONEMKLSPARSE) + set(HYPRE_USING_ONEMKLSPARSE ON CACHE BOOL "" FORCE) + endif() + if (HYPRE_ENABLE_ONEMKLBLAS) + set(HYPRE_USING_ONEMKLBLAS ON CACHE BOOL "" FORCE) + endif() + if (HYPRE_ENABLE_ONEMKLRAND) + set(HYPRE_USING_ONEMKLRAND ON CACHE BOOL "" FORCE) + endif() + if (HYPRE_USING_ONEMKLSPARSE OR HYPRE_USING_ONEMKLBLAS OR HYPRE_USING_ONEMKLRAND) + set(MKL_LINK static) + set(MKL_THREADING sequential) + find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}/lib/cmake/mkl") + target_compile_options(${PROJECT_NAME} PUBLIC $) + target_include_directories(${PROJECT_NAME} PUBLIC $) + target_link_libraries(${PROJECT_NAME} PUBLIC $) + endif() +endif() + if (HYPRE_USING_CALIPER) find_package(caliper REQUIRED) target_link_libraries(${PROJECT_NAME} PUBLIC caliper) diff --git a/src/IJ_mv/IJMatrix_parcsr.c b/src/IJ_mv/IJMatrix_parcsr.c index 611f01e82..82e7fb9bf 100644 --- a/src/IJ_mv/IJMatrix_parcsr.c +++ b/src/IJ_mv/IJMatrix_parcsr.c @@ -282,7 +282,10 @@ hypre_IJMatrixInitializeParCSR_v2(hypre_IJMatrix *matrix, HYPRE_MemoryLocation m hypre_ParCSRMatrixInitialize_v2(par_matrix, memory_location); hypre_AuxParCSRMatrixInitialize_v2(aux_matrix, memory_location_aux); - if (memory_location_aux == HYPRE_MEMORY_HOST) + /* WM: TODO - implement for sycl... is this available for other non-cuda/hip gpu implementations? */ +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + if (hypre_GetExecPolicy1(memory_location_aux) == HYPRE_EXEC_HOST) +#endif { if (hypre_AuxParCSRMatrixDiagSizes(aux_matrix)) { diff --git a/src/IJ_mv/aux_parcsr_matrix.c b/src/IJ_mv/aux_parcsr_matrix.c index eedc1aaae..e82a80163 100644 --- a/src/IJ_mv/aux_parcsr_matrix.c +++ b/src/IJ_mv/aux_parcsr_matrix.c @@ -264,12 +264,15 @@ hypre_AuxParCSRMatrixInitialize_v2( hypre_AuxParCSRMatrix *matrix, return 0; } + /* WM: Q - added the macro guards here (since IJ assembly not yet ported to sycl)... is this OK/correct? */ +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (memory_location != HYPRE_MEMORY_HOST) { /* GPU assembly */ hypre_AuxParCSRMatrixNeedAux(matrix) = 1; } else +#endif { /* CPU assembly */ /* allocate stash for setting or adding off processor values */ diff --git a/src/config/HYPREConfig.cmake.in b/src/config/HYPREConfig.cmake.in index 1445fec26..50df165e8 100644 --- a/src/config/HYPREConfig.cmake.in +++ b/src/config/HYPREConfig.cmake.in @@ -36,6 +36,9 @@ set(HYPRE_ENABLE_DEVICE_POOL @HYPRE_ENABLE_DEVICE_POOL@) set(HYPRE_ENABLE_CUBLAS @HYPRE_ENABLE_CUBLAS@) set(HYPRE_ENABLE_CURAND @HYPRE_ENABLE_CURAND@) set(HYPRE_ENABLE_GPU_PROFILING @HYPRE_ENABLE_GPU_PROFILING@) +set(HYPRE_ENABLE_ONEMKLSPARSE @HYPRE_ENABLE_ONEMKLSPARSE@) +set(HYPRE_ENABLE_ONEMKLBLAS @HYPRE_ENABLE_ONEMKLBLAS@) +set(HYPRE_ENABLE_ONEMKLRAND @HYPRE_ENABLE_ONEMKLRAND@) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") diff --git a/src/config/HYPRE_config.h.cmake.in b/src/config/HYPRE_config.h.cmake.in index eb22ae733..4d1c22bae 100644 --- a/src/config/HYPRE_config.h.cmake.in +++ b/src/config/HYPRE_config.h.cmake.in @@ -103,6 +103,15 @@ /* Use NVTX */ #cmakedefine HYPRE_USING_NVTX 1 +/* Use oneMLK spasre */ +#cmakedefine HYPRE_USING_ONEMKLSPARSE 1 + +/* Use oneMLK blas */ +#cmakedefine HYPRE_USING_ONEMKLBLAS 1 + +/* Use oneMLK rand */ +#cmakedefine HYPRE_USING_ONEMKLRAND 1 + /* Use SuperLU_Dist */ #cmakedefine HYPRE_USING_DSUPERLU 1 diff --git a/src/config/configure.in b/src/config/configure.in index 06e6a2279..e53d2beca 100644 --- a/src/config/configure.in +++ b/src/config/configure.in @@ -211,6 +211,7 @@ hypre_using_onemklsparse=no hypre_using_onemklblas=no hypre_using_onemklrand=no +hypre_found_mkl=no dnl ********************************************************************* dnl * Initialize flag-check variables @@ -519,6 +520,39 @@ AS_HELP_STRING([--enable-gpu-aware-mpi], [hypre_gpu_mpi=no] ) +AC_ARG_ENABLE(onemklsparse, +AS_HELP_STRING([--enable-onemklsparse], + [Use oneMKL sparse (default is YES).]), +[case "${enableval}" in + yes) hypre_using_onemklsparse=yes ;; + no) hypre_using_onemklsparse=no ;; + *) hypre_using_onemklsparse=yes ;; + esac], +[hypre_using_onemklsparse=yes] +) + +AC_ARG_ENABLE(onemklblas, +AS_HELP_STRING([--enable-onemklblas], + [Use oneMKL blas (default is YES).]), +[case "${enableval}" in + yes) hypre_using_onemklblas=yes ;; + no) hypre_using_onemklblas=no ;; + *) hypre_using_onemklblas=yes ;; + esac], +[hypre_using_onemklblas=yes] +) + +AC_ARG_ENABLE(onemklrand, +AS_HELP_STRING([--enable-onemklrand], + [Use oneMKL rand (default is YES).]), +[case "${enableval}" in + yes) hypre_using_onemklrand=yes ;; + no) hypre_using_onemklrand=no ;; + *) hypre_using_onemklrand=yes ;; + esac], +[hypre_using_onemklrand=yes] +) + dnl * The AC_DEFINE is below, after hypre_using_mpi is completely set dnl * Need to change to a new approach that always defines variable to some value @@ -2311,45 +2345,37 @@ AS_IF([test x"$hypre_using_sycl" == x"yes"], AC_DEFINE(HYPRE_USING_GPU, 1, [Define to 1 if executing on GPU device]) AC_DEFINE(HYPRE_USING_SYCL, 1, [SYCL being used]) + dnl (Ab)Using CUCC when compiling HIP LINK_CC=${CUCC} LINK_CXX=${CUCC} - if test "$hypre_user_chose_cuflags" = "no" + SYCLFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" + if test "$hypre_using_debug" = "yes" then - CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" - if test "$hypre_using_debug" = "yes" - then - CUFLAGS="-O0 -Wall -g ${CUFLAGS}" - else - CUFLAGS="-g -O3 ${CUFLAGS}" - fi + SYCLFLAGS="-Wall -g ${SYCLFLAGS}" + else + SYCLFLAGS="-g -O3 ${SYCLFLAGS}" fi + dnl (Ab)Use CUFLAGS to capture SYCL compilation flags + if test "$hypre_user_chose_cuflags" = "no" + then + CUFLAGS="${SYCLFLAGS}" + fi - dnl If not in debug mode, at least -O2, but the user can override with - dnl with SYCLFLAGS on the configure line. If in debug mode, -O0 -Wall - dnl plus flags for debugging symbols - AS_IF([test x"$hypre_using_debug" == x"yes"], - [SYCLFLAGS="-O0 -Wall -g ${SYCLFLAGS}"], - [SYCLFLAGS="-O2 ${SYCLFLAGS}"],) - - AS_IF([test x"$hypre_using_onemklsparse" == x"yes"], - [AC_DEFINE(HYPRE_USING_ONEMKLSPARSE, 1, [onemkl::SPARSE being used]) - HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" - HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/mkl/spblas.hpp" + AS_IF([test x"$hypre_using_onemklsparse" == x"yes" || test x"$hypre_using_onemklblas" == x"yes" || test x"$hypre_using_onemklrand" == x"yes"], + [AC_CHECK_HEADERS(["${MKLROOT}/include/mkl.h"], + [hypre_found_mkl=yes], + AC_MSG_ERROR([unable to find oneMKL ... Ensure that MKLROOT is set])) + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} ${MKLROOT}/lib/intel64/libmkl_sycl.a -Wl,-export-dynamic -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_sequential.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -lsycl -lOpenCL -lpthread -lm -ldl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include" ]) + + AS_IF([test x"$hypre_using_onemklsparse" == x"yes"], [AC_DEFINE(HYPRE_USING_ONEMKLSPARSE, 1, [onemkl::SPARSE being used])]) - AS_IF([test x"$hypre_using_onemklblas" == x"yes"], - [AC_DEFINE(HYPRE_USING_ONEMKLBLAS, 1, [onemkl::BLAS being used]) - HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" - HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp" - ]) + AS_IF([test x"$hypre_using_onemklblas" == x"yes"], [AC_DEFINE(HYPRE_USING_ONEMKLBLAS, 1, [onemkl::BLAS being used])]) - AS_IF([test x"$hypre_using_onemklrand" == x"yes"], - [AC_DEFINE(HYPRE_USING_ONEMKLRAND, 1, [onemkl::rng being used]) - HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" - HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/rng.hpp" - ]) + AS_IF([test x"$hypre_using_onemklrand" == x"yes"], [AC_DEFINE(HYPRE_USING_ONEMKLRAND, 1, [onemkl::rng being used])]) ]) dnl AS_IF([test x"$hypre_using_sycl" == x"yes"] diff --git a/src/configure b/src/configure index 7993465af..c77c3a11c 100755 --- a/src/configure +++ b/src/configure @@ -792,6 +792,9 @@ enable_rocblas enable_rocrand enable_gpu_profiling enable_gpu_aware_mpi +enable_onemklsparse +enable_onemklblas +enable_onemklrand with_LD with_LDFLAGS with_extra_CFLAGS @@ -1520,6 +1523,9 @@ Optional Features: --enable-rocrand Use rocRAND (default is YES). --enable-gpu-profiling Use NVTX on CUDA, rocTX on HIP (default is NO). --enable-gpu-aware-mpi Use GPU memory aware MPI + --enable-onemklsparse Use oneMKL sparse (default is YES). + --enable-onemklblas Use oneMKL blas (default is YES). + --enable-onemklrand Use oneMKL rand (default is YES). Optional Packages: --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] @@ -2819,6 +2825,7 @@ hypre_using_onemklsparse=no hypre_using_onemklblas=no hypre_using_onemklrand=no +hypre_found_mkl=no hypre_blas_lib_old_style=no hypre_blas_lib_dir_old_style=no @@ -3247,6 +3254,45 @@ else fi +# Check whether --enable-onemklsparse was given. +if test "${enable_onemklsparse+set}" = set; then : + enableval=$enable_onemklsparse; case "${enableval}" in + yes) hypre_using_onemklsparse=yes ;; + no) hypre_using_onemklsparse=no ;; + *) hypre_using_onemklsparse=yes ;; + esac +else + hypre_using_onemklsparse=yes + +fi + + +# Check whether --enable-onemklblas was given. +if test "${enable_onemklblas+set}" = set; then : + enableval=$enable_onemklblas; case "${enableval}" in + yes) hypre_using_onemklblas=yes ;; + no) hypre_using_onemklblas=no ;; + *) hypre_using_onemklblas=yes ;; + esac +else + hypre_using_onemklblas=yes + +fi + + +# Check whether --enable-onemklrand was given. +if test "${enable_onemklrand+set}" = set; then : + enableval=$enable_onemklrand; case "${enableval}" in + yes) hypre_using_onemklrand=yes ;; + no) hypre_using_onemklrand=no ;; + *) hypre_using_onemklrand=yes ;; + esac +else + hypre_using_onemklrand=yes + +fi + + if test "x$CC" = "x" then @@ -9138,24 +9184,40 @@ $as_echo "#define HYPRE_USING_GPU 1" >>confdefs.h $as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h - LINK_CC=${CUCC} + LINK_CC=${CUCC} LINK_CXX=${CUCC} - if test "$hypre_user_chose_cuflags" = "no" + SYCLFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" + if test "$hypre_using_debug" = "yes" then - CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" - if test "$hypre_using_debug" = "yes" - then - CUFLAGS="-O0 -Wall -g ${CUFLAGS}" - else - CUFLAGS="-g -O3 ${CUFLAGS}" - fi + SYCLFLAGS="-Wall -g ${SYCLFLAGS}" + else + SYCLFLAGS="-g -O3 ${SYCLFLAGS}" fi + if test "$hypre_user_chose_cuflags" = "no" + then + CUFLAGS="${SYCLFLAGS}" + fi - if test x"$hypre_using_debug" == x"yes"; then : - SYCLFLAGS="-O0 -Wall -g ${SYCLFLAGS}" -elif SYCLFLAGS="-O2 ${SYCLFLAGS}"; then : + if test x"$hypre_using_onemklsparse" == x"yes" || test x"$hypre_using_onemklblas" == x"yes" || test x"$hypre_using_onemklrand" == x"yes"; then : + for ac_header in "${MKLROOT}/include/mkl.h" +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + hypre_found_mkl=yes +else + as_fn_error $? "unable to find oneMKL ... Ensure that MKLROOT is set" "$LINENO" 5 +fi + +done + + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} ${MKLROOT}/lib/intel64/libmkl_sycl.a -Wl,-export-dynamic -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_sequential.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -lsycl -lOpenCL -lpthread -lm -ldl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include" fi @@ -9163,27 +9225,18 @@ fi $as_echo "#define HYPRE_USING_ONEMKLSPARSE 1" >>confdefs.h - HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" - HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/mkl/spblas.hpp" - fi if test x"$hypre_using_onemklblas" == x"yes"; then : $as_echo "#define HYPRE_USING_ONEMKLBLAS 1" >>confdefs.h - HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" - HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp" - fi if test x"$hypre_using_onemklrand" == x"yes"; then : $as_echo "#define HYPRE_USING_ONEMKLRAND 1" >>confdefs.h - HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" - HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/rng.hpp" - fi diff --git a/src/distributed_matrix/distributed_matrix_parcsr.c b/src/distributed_matrix/distributed_matrix_parcsr.c index 0df9ae59e..3c09b8e39 100644 --- a/src/distributed_matrix/distributed_matrix_parcsr.c +++ b/src/distributed_matrix/distributed_matrix_parcsr.c @@ -102,7 +102,7 @@ hypre_DistributedMatrixGetRowParCSR( hypre_DistributedMatrix *matrix, // RL: if HYPRE_ParCSRMatrixGetRow was on device, need the next line to guarantee it's done #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif return(ierr); diff --git a/src/parcsr_ls/ams.c b/src/parcsr_ls/ams.c index c1d43292a..7dc6f2ec2 100644 --- a/src/parcsr_ls/ams.c +++ b/src/parcsr_ls/ams.c @@ -459,7 +459,7 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A) HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim, nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd); - //hypre_SyncCudaComputeStream(hypre_handle()); + //hypre_SyncComputeStream(hypre_handle()); return hypre_error_flag; } diff --git a/src/parcsr_ls/par_2s_interp.c b/src/parcsr_ls/par_2s_interp.c index 828d68c51..62f017849 100644 --- a/src/parcsr_ls/par_2s_interp.c +++ b/src/parcsr_ls/par_2s_interp.c @@ -632,25 +632,24 @@ hypre_BoomerAMGBuildModPartialExtInterp( hypre_ParCSRMatrix *A, hypre_GpuProfilingPushRange("PartialExtInterp"); #endif - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - HYPRE_Int ierr = 0; - if (exec == HYPRE_EXEC_HOST) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + if (exec == HYPRE_EXEC_DEVICE) + { + ierr = hypre_BoomerAMGBuildModPartialExtInterpDevice(A, CF_marker, S, num_cpts_global, + num_old_cpts_global, + debug_flag, trunc_factor, max_elmts, P_ptr); + } + else +#endif { ierr = hypre_BoomerAMGBuildModPartialExtInterpHost(A, CF_marker, S, num_cpts_global, num_old_cpts_global, num_functions, dof_func, debug_flag, trunc_factor, max_elmts, P_ptr); } -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - else - { - ierr = hypre_BoomerAMGBuildModPartialExtInterpDevice(A, CF_marker, S, num_cpts_global, - num_old_cpts_global, - debug_flag, trunc_factor, max_elmts, P_ptr); - } -#endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); @@ -1338,25 +1337,24 @@ hypre_BoomerAMGBuildModPartialExtPEInterp( hypre_ParCSRMatrix *A, hypre_GpuProfilingPushRange("PartialExtPEInterp"); #endif - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - HYPRE_Int ierr = 0; - if (exec == HYPRE_EXEC_HOST) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + if (exec == HYPRE_EXEC_DEVICE) + { + ierr = hypre_BoomerAMGBuildModPartialExtPEInterpDevice(A, CF_marker, S, num_cpts_global, + num_old_cpts_global, + debug_flag, trunc_factor, max_elmts, P_ptr); + } + else +#endif { ierr = hypre_BoomerAMGBuildModPartialExtPEInterpHost(A, CF_marker, S, num_cpts_global, num_old_cpts_global, num_functions, dof_func, debug_flag, trunc_factor, max_elmts, P_ptr); } -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - else - { - ierr = hypre_BoomerAMGBuildModPartialExtPEInterpDevice(A, CF_marker, S, num_cpts_global, - num_old_cpts_global, - debug_flag, trunc_factor, max_elmts, P_ptr); - } -#endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); diff --git a/src/parcsr_ls/par_amgdd_fac_cycle.c b/src/parcsr_ls/par_amgdd_fac_cycle.c index 2af436fe0..1fb300d66 100644 --- a/src/parcsr_ls/par_amgdd_fac_cycle.c +++ b/src/parcsr_ls/par_amgdd_fac_cycle.c @@ -217,7 +217,7 @@ hypre_BoomerAMGDD_FAC_Jacobi( void *amgdd_vdata, HYPRE_Int level, HYPRE_Int cycle_param ) { -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) hypre_ParAMGDDData *amgdd_data = (hypre_ParAMGDDData*) amgdd_vdata; hypre_AMGDDCompGrid *compGrid = hypre_ParAMGDDDataCompGrid(amgdd_data)[level]; HYPRE_MemoryLocation memory_location = hypre_AMGDDCompGridMemoryLocation(compGrid); @@ -558,7 +558,7 @@ hypre_BoomerAMGDD_FAC_CFL1Jacobi( void *amgdd_vdata, HYPRE_Int level, HYPRE_Int cycle_param ) { -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) hypre_ParAMGDDData *amgdd_data = (hypre_ParAMGDDData*) amgdd_vdata; hypre_AMGDDCompGrid *compGrid = hypre_ParAMGDDDataCompGrid(amgdd_data)[level]; HYPRE_MemoryLocation memory_location = hypre_AMGDDCompGridMemoryLocation(compGrid); diff --git a/src/parcsr_ls/par_amgdd_fac_cycle_device.c b/src/parcsr_ls/par_amgdd_fac_cycle_device.c index 9ec4f4a70..dbc1f6fbd 100644 --- a/src/parcsr_ls/par_amgdd_fac_cycle_device.c +++ b/src/parcsr_ls/par_amgdd_fac_cycle_device.c @@ -8,7 +8,7 @@ #include "_hypre_parcsr_ls.h" #include "_hypre_utilities.hpp" -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) HYPRE_Int hypre_BoomerAMGDD_FAC_JacobiDevice( void *amgdd_vdata, @@ -149,4 +149,4 @@ hypre_BoomerAMGDD_FAC_CFL1JacobiDevice( void *amgdd_vdata, return hypre_error_flag; } -#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) diff --git a/src/parcsr_ls/par_mod_lr_interp.c b/src/parcsr_ls/par_mod_lr_interp.c index e3a9c9e46..5447c54c9 100644 --- a/src/parcsr_ls/par_mod_lr_interp.c +++ b/src/parcsr_ls/par_mod_lr_interp.c @@ -554,23 +554,22 @@ hypre_BoomerAMGBuildModExtInterp(hypre_ParCSRMatrix *A, hypre_GpuProfilingPushRange("ModExtInterp"); #endif - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - HYPRE_Int ierr = 0; - if (exec == HYPRE_EXEC_HOST) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + if (exec == HYPRE_EXEC_DEVICE) + { + ierr = hypre_BoomerAMGBuildExtInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL, + debug_flag, trunc_factor, max_elmts, P_ptr); + } + else +#endif { ierr = hypre_BoomerAMGBuildModExtInterpHost(A, CF_marker, S, num_cpts_global, num_functions, dof_func, debug_flag, trunc_factor, max_elmts, P_ptr); } -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - else - { - ierr = hypre_BoomerAMGBuildExtInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL, - debug_flag, trunc_factor, max_elmts, P_ptr); - } -#endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); @@ -1869,23 +1868,22 @@ hypre_BoomerAMGBuildModExtPEInterp(hypre_ParCSRMatrix *A, hypre_GpuProfilingPushRange("ModExtPEInterp"); #endif - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - HYPRE_Int ierr = 0; - if (exec == HYPRE_EXEC_HOST) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + if (exec == HYPRE_EXEC_DEVICE) + { + ierr = hypre_BoomerAMGBuildExtPEInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL, + debug_flag, trunc_factor, max_elmts, P_ptr); + } + else +#endif { ierr = hypre_BoomerAMGBuildModExtPEInterpHost(A, CF_marker, S, num_cpts_global, num_functions, dof_func, debug_flag, trunc_factor, max_elmts, P_ptr); } -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - else - { - ierr = hypre_BoomerAMGBuildExtPEInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL, - debug_flag, trunc_factor, max_elmts, P_ptr); - } -#endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); diff --git a/src/parcsr_ls/par_mod_multi_interp.c b/src/parcsr_ls/par_mod_multi_interp.c index b3030a832..7828c15c2 100644 --- a/src/parcsr_ls/par_mod_multi_interp.c +++ b/src/parcsr_ls/par_mod_multi_interp.c @@ -122,8 +122,8 @@ hypre_BoomerAMGBuildModMultipassHost( hypre_ParCSRMatrix *A, pass_starts = hypre_CTAlloc(HYPRE_Int, 10, HYPRE_MEMORY_HOST); /* contains beginning for each pass in pass_order field, assume no more than 10 passes */ - P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_HOST); - P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_HOST); + P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_DEVICE); + P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_DEVICE); cnt = 0; remaining = 0; @@ -351,10 +351,10 @@ hypre_BoomerAMGBuildModMultipassHost( hypre_ParCSRMatrix *A, P_offd_i[i + 1] += P_offd_i[i]; } - P_diag_j = hypre_CTAlloc(HYPRE_Int, P_diag_i[n_fine], HYPRE_MEMORY_HOST); - P_diag_data = hypre_CTAlloc(HYPRE_Real, P_diag_i[n_fine], HYPRE_MEMORY_HOST); - P_offd_j = hypre_CTAlloc(HYPRE_Int, P_offd_i[n_fine], HYPRE_MEMORY_HOST); - P_offd_data = hypre_CTAlloc(HYPRE_Real, P_offd_i[n_fine], HYPRE_MEMORY_HOST); + P_diag_j = hypre_CTAlloc(HYPRE_Int, P_diag_i[n_fine], HYPRE_MEMORY_DEVICE); + P_diag_data = hypre_CTAlloc(HYPRE_Real, P_diag_i[n_fine], HYPRE_MEMORY_DEVICE); + P_offd_j = hypre_CTAlloc(HYPRE_Int, P_offd_i[n_fine], HYPRE_MEMORY_DEVICE); + P_offd_data = hypre_CTAlloc(HYPRE_Real, P_offd_i[n_fine], HYPRE_MEMORY_DEVICE); /* insert weights for coarse points */ for (i = 0; i < pass_starts[1]; i++) diff --git a/src/parcsr_ls/par_multi_interp.c b/src/parcsr_ls/par_multi_interp.c index 89abb7d96..ffca25999 100644 --- a/src/parcsr_ls/par_multi_interp.c +++ b/src/parcsr_ls/par_multi_interp.c @@ -255,8 +255,8 @@ hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix *A, if (pass_array_size) { pass_array = hypre_CTAlloc(HYPRE_Int, pass_array_size, HYPRE_MEMORY_HOST); } pass_pointer = hypre_CTAlloc(HYPRE_Int, max_num_passes + 1, HYPRE_MEMORY_HOST); if (n_fine) { assigned = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); } - P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_HOST); - P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_HOST); + P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_DEVICE); + P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_DEVICE); if (n_coarse) { C_array = hypre_CTAlloc(HYPRE_Int, n_coarse, HYPRE_MEMORY_HOST); } if (num_cols_offd) @@ -1138,14 +1138,14 @@ hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix *A, hypre_TFree(cnt_nz_offd_per_thread, HYPRE_MEMORY_HOST); hypre_TFree(max_num_threads, HYPRE_MEMORY_HOST); - P_diag_j = hypre_CTAlloc(HYPRE_Int, total_nz, HYPRE_MEMORY_HOST); - P_diag_data = hypre_CTAlloc(HYPRE_Real, total_nz, HYPRE_MEMORY_HOST); + P_diag_j = hypre_CTAlloc(HYPRE_Int, total_nz, HYPRE_MEMORY_DEVICE); + P_diag_data = hypre_CTAlloc(HYPRE_Real, total_nz, HYPRE_MEMORY_DEVICE); if (total_nz_offd) { - P_offd_j = hypre_CTAlloc(HYPRE_Int, total_nz_offd, HYPRE_MEMORY_HOST); - P_offd_data = hypre_CTAlloc(HYPRE_Real, total_nz_offd, HYPRE_MEMORY_HOST); + P_offd_j = hypre_CTAlloc(HYPRE_Int, total_nz_offd, HYPRE_MEMORY_DEVICE); + P_offd_data = hypre_CTAlloc(HYPRE_Real, total_nz_offd, HYPRE_MEMORY_DEVICE); } for (i = 0; i < n_fine; i++) diff --git a/src/parcsr_ls/par_relax.c b/src/parcsr_ls/par_relax.c index 608bc4209..9d584d328 100644 --- a/src/parcsr_ls/par_relax.c +++ b/src/parcsr_ls/par_relax.c @@ -1145,7 +1145,7 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_SetSyncCudaCompute(sync_stream); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif return hypre_error_flag; diff --git a/src/parcsr_ls/par_relax_more_device.c b/src/parcsr_ls/par_relax_more_device.c index b396390f7..f6b55e243 100644 --- a/src/parcsr_ls/par_relax_more_device.c +++ b/src/parcsr_ls/par_relax_more_device.c @@ -169,7 +169,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A, rowsums_upper, scale); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); e_min = HYPRE_THRUST_CALL(reduce, rowsums_lower, rowsums_lower + A_num_rows, (HYPRE_Real)0, thrust::minimum()); @@ -323,7 +323,7 @@ hypre_ParCSRMaxEigEstimateCGDevice(hypre_ParCSRMatrix *A, /* matrix to relax /* set residual to random */ hypre_CurandUniform(local_size, r_data, 0, 0, 0, 0); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); HYPRE_THRUST_CALL(transform, r_data, r_data + local_size, r_data, diff --git a/src/parcsr_mv/par_csr_communication.c b/src/parcsr_mv/par_csr_communication.c index 35fef28c8..670dd2e15 100644 --- a/src/parcsr_mv/par_csr_communication.c +++ b/src/parcsr_mv/par_csr_communication.c @@ -434,7 +434,7 @@ hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int job, recv_data = recv_data_in; // TODO RL: it seems that we need to sync the CUDA stream before doing GPU-GPU MPI. // Need to check MPI documentation whether this is acutally true - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif num_requests = num_sends + num_recvs; diff --git a/src/parcsr_mv/par_csr_matop.c b/src/parcsr_mv/par_csr_matop.c index 8eeb6dcf4..11aac0b06 100644 --- a/src/parcsr_mv/par_csr_matop.c +++ b/src/parcsr_mv/par_csr_matop.c @@ -4113,7 +4113,7 @@ hypre_ParTMatmul( hypre_ParCSRMatrix *A, if ( hypre_GetExecPolicy2(memory_location_A, memory_location_B) == HYPRE_EXEC_DEVICE ) { hypre_CSRMatrixMoveDiagFirstDevice(hypre_ParCSRMatrixDiag(C)); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); } #endif diff --git a/src/parcsr_mv/par_csr_matop_device.c b/src/parcsr_mv/par_csr_matop_device.c index 9387a863f..71c3c1d7b 100644 --- a/src/parcsr_mv/par_csr_matop_device.c +++ b/src/parcsr_mv/par_csr_matop_device.c @@ -306,7 +306,7 @@ hypre_MergeDiagAndOffdDevice(hypre_ParCSRMatrix *A) hypre_CSRMatrixData(B) = B_a; hypre_CSRMatrixMemoryLocation(B) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return B; } @@ -1044,7 +1044,7 @@ hypre_ParCSRMatrixGetRowDevice( hypre_ParCSRMatrix *mat, *values = hypre_ParCSRMatrixRowvalues(mat); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return hypre_error_flag; } @@ -1603,7 +1603,7 @@ hypre_ParCSRDiagScale( HYPRE_ParCSRMatrix HA, HYPRE_Int ierr = 0; #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypreDevice_DiagScaleVector(local_size, A_i, A_data, y_data, 0.0, x_data); - //hypre_SyncCudaComputeStream(hypre_handle()); + //hypre_SyncComputeStream(hypre_handle()); #else /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ HYPRE_Int i; #if defined(HYPRE_USING_DEVICE_OPENMP) diff --git a/src/parcsr_mv/par_csr_matvec.c b/src/parcsr_mv/par_csr_matvec.c index 30921fe96..dea08f6fc 100644 --- a/src/parcsr_mv/par_csr_matvec.c +++ b/src/parcsr_mv/par_csr_matvec.c @@ -349,7 +349,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha, #if defined(HYPRE_USING_GPU) hypre_SetSyncCudaCompute(sync_stream); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -725,7 +725,7 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha, #if defined(HYPRE_USING_GPU) hypre_SetSyncCudaCompute(sync_stream); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE diff --git a/src/parcsr_mv/par_csr_triplemat_device.c b/src/parcsr_mv/par_csr_triplemat_device.c index 0b8a67fd6..43c2cd402 100644 --- a/src/parcsr_mv/par_csr_triplemat_device.c +++ b/src/parcsr_mv/par_csr_triplemat_device.c @@ -497,7 +497,7 @@ hypre_ParCSRTMatMatKTDevice( hypre_ParCSRMatrix *A, hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C))); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return C; } @@ -817,7 +817,7 @@ hypre_ParCSRMatrixRAPKTDevice( hypre_ParCSRMatrix *R, hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C))); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return C; } diff --git a/src/seq_mv/csr_matop.c b/src/seq_mv/csr_matop.c index 550db90d7..aec4cfe93 100644 --- a/src/seq_mv/csr_matop.c +++ b/src/seq_mv/csr_matop.c @@ -1269,7 +1269,7 @@ hypre_CSRMatrixTranspose(hypre_CSRMatrix *A, { HYPRE_Int ierr = 0; -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_CSRMatrixMemoryLocation(A) ); if (exec == HYPRE_EXEC_DEVICE) diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index 5f56789ae..2b5193868 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -15,7 +15,7 @@ #include "_hypre_utilities.hpp" #include "seq_mv.hpp" -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) hypre_CsrsvData* hypre_CsrsvDataCreate() { @@ -82,9 +82,33 @@ hypre_GpuMatDataCreate() hypre_GpuMatDataMatInfo(data) = info; #endif +#if defined(HYPRE_USING_ONEMKLSPARSE) + oneapi::mkl::sparse::matrix_handle_t mat_handle; + HYPRE_SYCL_CALL( oneapi::mkl::sparse::init_matrix_handle(&mat_handle) ); + hypre_GpuMatDataMatHandle(data) = mat_handle; +#endif + return data; } +void +hypre_GPUMatDataSetCSRData( hypre_GpuMatData *data, + hypre_CSRMatrix *matrix) +{ + +#if defined(HYPRE_USING_ONEMKLSPARSE) + oneapi::mkl::sparse::matrix_handle_t mat_handle = hypre_GpuMatDataMatHandle(data); + HYPRE_SYCL_CALL( oneapi::mkl::sparse::set_csr_data(mat_handle, + hypre_CSRMatrixNumRows(matrix), + hypre_CSRMatrixNumCols(matrix), + oneapi::mkl::index_base::zero, + hypre_CSRMatrixI(matrix), + hypre_CSRMatrixJ(matrix), + hypre_CSRMatrixData(matrix)) ); +#endif + +} + void hypre_GpuMatDataDestroy(hypre_GpuMatData *data) { @@ -103,6 +127,10 @@ hypre_GpuMatDataDestroy(hypre_GpuMatData *data) HYPRE_ROCSPARSE_CALL( rocsparse_destroy_mat_info(hypre_GpuMatDataMatInfo(data)) ); #endif +#if defined(HYPRE_USING_ONEMKLSPARSE) + HYPRE_SYCL_CALL( oneapi::mkl::sparse::release_matrix_handle(&hypre_GpuMatDataMatHandle(data)) ); +#endif + hypre_TFree(data, HYPRE_MEMORY_HOST); } @@ -151,7 +179,7 @@ hypre_CSRMatrixAddDevice ( HYPRE_Complex alpha, hypre_CSRMatrixData(C) = C_data; hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return C; } @@ -174,7 +202,7 @@ hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A, hypreDevice_CSRSpGemm(A, B, &C); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return C; } @@ -326,7 +354,7 @@ hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, *B_ext_diag_ptr = B_ext_diag; *B_ext_offd_ptr = B_ext_offd; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return ierr; } @@ -576,7 +604,7 @@ hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, hypre_CSRMatrixData(C) = C_data; hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return C; } @@ -618,7 +646,7 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return hypre_error_flag; } @@ -681,7 +709,7 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim, nrows, A_i, A_j, A_data); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return hypre_error_flag; } @@ -725,7 +753,7 @@ hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) hypre_TFree(result, HYPRE_MEMORY_DEVICE); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return ierr; } @@ -822,7 +850,7 @@ hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, hypre_TFree(result, HYPRE_MEMORY_DEVICE); #endif - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return ierr; } @@ -917,7 +945,7 @@ hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, hypre_TFree(result, HYPRE_MEMORY_DEVICE); #endif - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return ierr; } @@ -1113,7 +1141,7 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, row_sum, scal, set_or_add[0] == 's' ); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); } /* type 0: diag @@ -1206,7 +1234,7 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); } /* return C = [A; B] */ @@ -1530,9 +1558,6 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, #elif defined(HYPRE_USING_ROCSPARSE) hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); -#elif defined(HYPRE_USING_ONEMKLSPARSE) - hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); #else hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #endif @@ -1546,7 +1571,7 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, *AT_ptr = C; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); return hypre_error_flag; } diff --git a/src/seq_mv/csr_matrix.c b/src/seq_mv/csr_matrix.c index 275625ec9..e59aab345 100644 --- a/src/seq_mv/csr_matrix.c +++ b/src/seq_mv/csr_matrix.c @@ -44,7 +44,7 @@ hypre_CSRMatrixCreate( HYPRE_Int num_rows, /* set defaults */ hypre_CSRMatrixOwnsData(matrix) = 1; -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) hypre_CSRMatrixSortedJ(matrix) = NULL; hypre_CSRMatrixSortedData(matrix) = NULL; hypre_CSRMatrixCsrsvData(matrix) = NULL; @@ -78,7 +78,7 @@ hypre_CSRMatrixDestroy( hypre_CSRMatrix *matrix ) hypre_TFree(hypre_CSRMatrixBigJ(matrix), memory_location); } -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) hypre_TFree(hypre_CSRMatrixSortedData(matrix), memory_location); hypre_TFree(hypre_CSRMatrixSortedJ(matrix), memory_location); hypre_CsrsvDataDestroy(hypre_CSRMatrixCsrsvData(matrix)); @@ -1074,7 +1074,7 @@ hypre_CSRMatrixPrefetch( hypre_CSRMatrix *A, HYPRE_MemoryLocation memory_locatio return ierr; } -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) hypre_GpuMatData * hypre_CSRMatrixGetGPUMatData(hypre_CSRMatrix *matrix) { @@ -1086,6 +1086,7 @@ hypre_CSRMatrixGetGPUMatData(hypre_CSRMatrix *matrix) if (!hypre_CSRMatrixGPUMatData(matrix)) { hypre_CSRMatrixGPUMatData(matrix) = hypre_GpuMatDataCreate(); + hypre_GPUMatDataSetCSRData(hypre_CSRMatrixGPUMatData(matrix), matrix); } return hypre_CSRMatrixGPUMatData(matrix); diff --git a/src/seq_mv/csr_matrix.h b/src/seq_mv/csr_matrix.h index a0dde52e3..eb6d50f4a 100644 --- a/src/seq_mv/csr_matrix.h +++ b/src/seq_mv/csr_matrix.h @@ -16,12 +16,12 @@ #ifndef hypre_CSR_MATRIX_HEADER #define hypre_CSR_MATRIX_HEADER -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) struct hypre_CsrsvData; typedef struct hypre_CsrsvData hypre_CsrsvData; #endif -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) struct hypre_GpuMatData; typedef struct hypre_GpuMatData hypre_GpuMatData; #endif @@ -45,7 +45,7 @@ typedef struct HYPRE_Int *rownnz; /* for compressing rows in matrix multiplication */ HYPRE_Int num_rownnz; HYPRE_MemoryLocation memory_location; /* memory location of arrays i, j, data */ -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) HYPRE_Int *sorted_j; /* some cusparse routines require sorted CSR */ HYPRE_Complex *sorted_data; hypre_CsrsvData *csrsv_data; @@ -69,7 +69,7 @@ typedef struct #define hypre_CSRMatrixOwnsData(matrix) ((matrix) -> owns_data) #define hypre_CSRMatrixMemoryLocation(matrix) ((matrix) -> memory_location) -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) #define hypre_CSRMatrixSortedJ(matrix) ((matrix) -> sorted_j) #define hypre_CSRMatrixSortedData(matrix) ((matrix) -> sorted_data) #define hypre_CSRMatrixCsrsvData(matrix) ((matrix) -> csrsv_data) diff --git a/src/seq_mv/csr_matvec.c b/src/seq_mv/csr_matvec.c index a743292fe..267710ca3 100644 --- a/src/seq_mv/csr_matvec.c +++ b/src/seq_mv/csr_matvec.c @@ -712,10 +712,6 @@ hypre_CSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha, #if defined(HYPRE_USING_GPU) HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_CSRMatrixMemoryLocation(A) ); - /* WM: TODO - remove after sycl implementation in place */ -#if defined(HYPRE_USING_SYCL) - exec = HYPRE_EXEC_HOST; -#endif if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_CSRMatrixMatvecDevice(0, alpha, A, x, beta, b, y, offset); diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index 811040a51..fc681d475 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -13,7 +13,6 @@ #include "seq_mv.h" #include "_hypre_utilities.hpp" -#include "seq_mv.hpp" #if defined(HYPRE_USING_GPU) @@ -53,8 +52,6 @@ hypre_CSRMatrixMatvecDevice2( HYPRE_Int trans, hypre_CSRMatrixMatvecRocsparse(trans, alpha, A, x, beta, y, offset); #elif defined(HYPRE_USING_ONEMKLSPARSE) hypre_CSRMatrixMatvecOnemklsparse(trans, alpha, A, x, beta, y, offset); - // WM: TODO: remove trivial HYPRE_USING_SYCL branch after onemlksparse implementation is in -#elif defined(HYPRE_USING_SYCL) #else // #ifdef HYPRE_USING_CUSPARSE #error HYPRE SPMV TODO #endif @@ -117,7 +114,7 @@ hypre_CSRMatrixMatvecDevice( HYPRE_Int trans, hypre_CSRMatrixMatvecDevice2(trans, alpha, A, x, beta, y, offset); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); @@ -201,7 +198,7 @@ hypre_CSRMatrixMatvecCusparseNewAPI( HYPRE_Int trans, #endif dBuffer) ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); if (trans) { @@ -328,7 +325,29 @@ hypre_CSRMatrixMatvecOnemklsparse( HYPRE_Int trans, hypre_Vector *y, HYPRE_Int offset ) { - /* WM: TODO */ + sycl::queue *compute_queue = hypre_HandleComputeStream(hypre_handle()); + hypre_CSRMatrix *AT; + oneapi::mkl::sparse::matrix_handle_t matA_handle = hypre_CSRMatrixGPUMatHandle(A); + + if (trans) + { + hypre_CSRMatrixTransposeDevice(A, &AT, 1); + matA_handle = hypre_CSRMatrixGPUMatHandle(AT); + } + + HYPRE_SYCL_CALL( oneapi::mkl::sparse::gemv(*compute_queue, + oneapi::mkl::transpose::nontrans, + alpha, + matA_handle, + hypre_VectorData(x), + beta, + hypre_VectorData(y) + offset).wait() ); + + if (trans) + { + hypre_CSRMatrixDestroy(AT); + } + return hypre_error_flag; } #endif // #if defined(HYPRE_USING_ROCSPARSE) diff --git a/src/seq_mv/csr_spgemm_device.c b/src/seq_mv/csr_spgemm_device.c index 7d44c2cd0..65ed87e52 100644 --- a/src/seq_mv/csr_spgemm_device.c +++ b/src/seq_mv/csr_spgemm_device.c @@ -89,7 +89,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnz(m, k, n, d_ia, d_ja, d_ib, d_jb, 0 /* without input rc */, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("Rownnz time %f\n", t2); #endif @@ -101,7 +101,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, 1 /* exact row nnz */, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif @@ -115,7 +115,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzEst time %f\n", t2); #endif @@ -126,7 +126,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypreDevice_CSRSpGemmNumerWithRownnzEstimate(m, k, n, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_rc, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif @@ -140,7 +140,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzEst time %f\n", t2); #endif @@ -157,7 +157,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, d_rc + 2 * m, thrust::identity() ); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzBound time %f\n", t2); #endif @@ -169,7 +169,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, rownnz_exact, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif diff --git a/src/seq_mv/csr_sptrans_device.c b/src/seq_mv/csr_sptrans_device.c index 548665ed2..999f7d46d 100644 --- a/src/seq_mv/csr_sptrans_device.c +++ b/src/seq_mv/csr_sptrans_device.c @@ -5,6 +5,7 @@ * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ +#include "_hypre_onedpl.hpp" #include "seq_mv.h" #include "_hypre_utilities.hpp" @@ -146,19 +147,6 @@ hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPR #endif // #if defined(HYPRE_USING_ROCSPARSE) -#if defined(HYPRE_USING_ONEMKLSPARSE) -HYPRE_Int -hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, - HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, - HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, - HYPRE_Int want_data) -{ - /* WM: TODO */ - return hypre_error_flag; -} -#endif // #if defined(HYPRE_USING_ONEMKLSPARSE) - - #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) HYPRE_Int @@ -242,7 +230,79 @@ hypreDevice_CSRSpTrans(HYPRE_Int m, HYPRE_Int n, HYPRE_Int HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data) { - /* WM: TODO */ +#ifdef HYPRE_PROFILE + hypre_profile_times[HYPRE_TIMER_ID_SPTRANS] -= hypre_MPI_Wtime(); +#endif + + HYPRE_Int *d_jt, *d_it, *d_pm, *d_ic, *d_jc; + HYPRE_Complex *d_ac = NULL; + HYPRE_Int *mem_work = hypre_TAlloc(HYPRE_Int, 3 * nnzA, HYPRE_MEMORY_DEVICE); + + /* allocate C */ + d_jc = hypre_TAlloc(HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE); + if (want_data) + { + d_ac = hypre_TAlloc(HYPRE_Complex, nnzA, HYPRE_MEMORY_DEVICE); + } + + /* permutation vector */ + d_pm = mem_work; + + /* expansion: A's row idx */ + d_it = d_pm + nnzA; + hypreDevice_CsrRowPtrsToIndices_v2(m, nnzA, d_ia, d_it); + + /* a copy of col idx of A */ + d_jt = d_it + nnzA; + hypre_TMemcpy(d_jt, d_ja, HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + /* sort: by col */ + oneapi::dpl::counting_iterator count(0); + HYPRE_ONEDPL_CALL( std::copy, + count, + count + nnzA, + d_pm); + + auto zip_jt_pm = oneapi::dpl::make_zip_iterator(d_jt, d_pm); + HYPRE_ONEDPL_CALL( std::stable_sort, + zip_jt_pm, + zip_jt_pm + nnzA, + [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); } ); + + auto permuted_it = oneapi::dpl::make_permutation_iterator(d_it, d_pm); + HYPRE_ONEDPL_CALL( std::copy, + permuted_it, + permuted_it + nnzA, + d_jc ); + + if (want_data) + { + auto permuted_aa = oneapi::dpl::make_permutation_iterator(d_aa, d_pm); + HYPRE_ONEDPL_CALL( std::copy, + permuted_aa, + permuted_aa + nnzA, + d_ac ); + } + + /* convert into ic: row idx --> row ptrs */ + d_ic = hypreDevice_CsrRowIndicesToPtrs(n, nnzA, d_jt); + +#ifdef HYPRE_DEBUG + HYPRE_Int nnzC; + hypre_TMemcpy(&nnzC, &d_ic[n], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); + hypre_assert(nnzC == nnzA); +#endif + + hypre_TFree(mem_work, HYPRE_MEMORY_DEVICE); + + *d_ic_out = d_ic; + *d_jc_out = d_jc; + *d_ac_out = d_ac; + +#ifdef HYPRE_PROFILE + hypre_profile_times[HYPRE_TIMER_ID_SPTRANS] += hypre_MPI_Wtime(); +#endif + return hypre_error_flag; } #endif // #if defined(HYPRE_USING_SYCL) diff --git a/src/seq_mv/protos.h b/src/seq_mv/protos.h index 9081b58c2..c1863b1d1 100644 --- a/src/seq_mv/protos.h +++ b/src/seq_mv/protos.h @@ -315,14 +315,16 @@ HYPRE_Int hypre_SeqVectorElmdivpyMarked( hypre_Vector *x, hypre_Vector *b, hypre HYPRE_Int hypre_CSRMatrixSpMVDevice( HYPRE_Complex alpha, hypre_CSRMatrix *A, hypre_Vector *x, HYPRE_Complex beta, hypre_Vector *y, HYPRE_Int fill ); -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) hypre_CsrsvData* hypre_CsrsvDataCreate(); void hypre_CsrsvDataDestroy(hypre_CsrsvData *data); hypre_GpuMatData* hypre_GpuMatDataCreate(); +void hypre_GPUMatDataSetCSRData(hypre_GpuMatData *data, hypre_CSRMatrix *matrix); void hypre_GpuMatDataDestroy(hypre_GpuMatData *data); hypre_GpuMatData* hypre_CSRMatrixGetGPUMatData(hypre_CSRMatrix *matrix); #define hypre_CSRMatrixGPUMatDescr(matrix) ( hypre_GpuMatDataMatDecsr(hypre_CSRMatrixGetGPUMatData(matrix)) ) #define hypre_CSRMatrixGPUMatInfo(matrix) ( hypre_GpuMatDataMatInfo (hypre_CSRMatrixGetGPUMatData(matrix)) ) +#define hypre_CSRMatrixGPUMatHandle(matrix) ( hypre_GpuMatDataMatHandle (hypre_CSRMatrixGetGPUMatData(matrix)) ) #define hypre_CSRMatrixGPUMatSpMVBuffer(matrix) ( hypre_GpuMatDataSpMVBuffer (hypre_CSRMatrixGetGPUMatData(matrix)) ) #endif void hypre_CSRMatrixGpuSpMVAnalysis(hypre_CSRMatrix *matrix); diff --git a/src/seq_mv/seq_mv.h b/src/seq_mv/seq_mv.h index de3468523..6df9e9b3f 100644 --- a/src/seq_mv/seq_mv.h +++ b/src/seq_mv/seq_mv.h @@ -36,12 +36,12 @@ extern "C" { #ifndef hypre_CSR_MATRIX_HEADER #define hypre_CSR_MATRIX_HEADER -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) struct hypre_CsrsvData; typedef struct hypre_CsrsvData hypre_CsrsvData; #endif -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) struct hypre_GpuMatData; typedef struct hypre_GpuMatData hypre_GpuMatData; #endif @@ -65,7 +65,7 @@ typedef struct HYPRE_Int *rownnz; /* for compressing rows in matrix multiplication */ HYPRE_Int num_rownnz; HYPRE_MemoryLocation memory_location; /* memory location of arrays i, j, data */ -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) HYPRE_Int *sorted_j; /* some cusparse routines require sorted CSR */ HYPRE_Complex *sorted_data; hypre_CsrsvData *csrsv_data; @@ -89,7 +89,7 @@ typedef struct #define hypre_CSRMatrixOwnsData(matrix) ((matrix) -> owns_data) #define hypre_CSRMatrixMemoryLocation(matrix) ((matrix) -> memory_location) -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) #define hypre_CSRMatrixSortedJ(matrix) ((matrix) -> sorted_j) #define hypre_CSRMatrixSortedData(matrix) ((matrix) -> sorted_data) #define hypre_CSRMatrixCsrsvData(matrix) ((matrix) -> csrsv_data) @@ -587,14 +587,16 @@ HYPRE_Int hypre_SeqVectorElmdivpyMarked( hypre_Vector *x, hypre_Vector *b, hypre HYPRE_Int hypre_CSRMatrixSpMVDevice( HYPRE_Complex alpha, hypre_CSRMatrix *A, hypre_Vector *x, HYPRE_Complex beta, hypre_Vector *y, HYPRE_Int fill ); -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) hypre_CsrsvData* hypre_CsrsvDataCreate(); void hypre_CsrsvDataDestroy(hypre_CsrsvData *data); hypre_GpuMatData* hypre_GpuMatDataCreate(); +void hypre_GPUMatDataSetCSRData(hypre_GpuMatData *data, hypre_CSRMatrix *matrix); void hypre_GpuMatDataDestroy(hypre_GpuMatData *data); hypre_GpuMatData* hypre_CSRMatrixGetGPUMatData(hypre_CSRMatrix *matrix); #define hypre_CSRMatrixGPUMatDescr(matrix) ( hypre_GpuMatDataMatDecsr(hypre_CSRMatrixGetGPUMatData(matrix)) ) #define hypre_CSRMatrixGPUMatInfo(matrix) ( hypre_GpuMatDataMatInfo (hypre_CSRMatrixGetGPUMatData(matrix)) ) +#define hypre_CSRMatrixGPUMatHandle(matrix) ( hypre_GpuMatDataMatHandle (hypre_CSRMatrixGetGPUMatData(matrix)) ) #define hypre_CSRMatrixGPUMatSpMVBuffer(matrix) ( hypre_GpuMatDataSpMVBuffer (hypre_CSRMatrixGetGPUMatData(matrix)) ) #endif void hypre_CSRMatrixGpuSpMVAnalysis(hypre_CSRMatrix *matrix); diff --git a/src/seq_mv/vector.c b/src/seq_mv/vector.c index 8b024f39c..c76e2f421 100644 --- a/src/seq_mv/vector.c +++ b/src/seq_mv/vector.c @@ -11,6 +11,7 @@ * *****************************************************************************/ +#include "_hypre_onedpl.hpp" #include "seq_mv.h" #include "_hypre_utilities.hpp" //RL: TODO vector_device.c, include cuda there @@ -286,6 +287,11 @@ hypre_SeqVectorSetConstantValues( hypre_Vector *v, { HYPRE_THRUST_CALL( fill_n, vector_data, size, value ); } +#elif defined(HYPRE_USING_SYCL) + if (size > 0) + { + HYPRE_ONEDPL_CALL( std::fill_n, vector_data, size, value ); + } #else HYPRE_Int i; #if defined(HYPRE_USING_DEVICE_OPENMP) @@ -300,7 +306,7 @@ hypre_SeqVectorSetConstantValues( hypre_Vector *v, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -467,13 +473,31 @@ hypre_SeqVectorScale( HYPRE_Complex alpha, //hypre_SeqVectorPrefetch(y, HYPRE_MEMORY_DEVICE); +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + #if defined(HYPRE_USING_CUBLAS) HYPRE_CUBLAS_CALL( cublasDscal(hypre_HandleCublasHandle(hypre_handle()), size, &alpha, y_data, 1) ); #else HYPRE_THRUST_CALL( transform, y_data, y_data + size, y_data, alpha * _1 ); -#endif +#endif // #if defined(HYPRE_USING_CUBLAS) + +#elif defined(HYPRE_USING_SYCL) // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + +#if defined(HYPRE_USING_ONEMKLBLAS) + HYPRE_SYCL_CALL( oneapi::mkl::blas::scal(*hypre_HandleComputeStream(hypre_handle()), + size, alpha, + y_data, 1).wait() ); #else + HYPRE_ONEDPL_CALL( std::transform, y_data, y_data + size, + y_data, [alpha](HYPRE_Complex y) -> HYPRE_Complex { return alpha * y; } ); +#endif // #if defined(HYPRE_USING_ONEMKL) + +#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + +#else // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) + HYPRE_Int i; #if defined(HYPRE_USING_DEVICE_OPENMP) #pragma omp target teams distribute parallel for private(i) is_device_ptr(y_data) @@ -485,10 +509,10 @@ hypre_SeqVectorScale( HYPRE_Complex alpha, y_data[i] *= alpha; } -#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ +#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -520,14 +544,32 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha, //hypre_SeqVectorPrefetch(x, HYPRE_MEMORY_DEVICE); //hypre_SeqVectorPrefetch(y, HYPRE_MEMORY_DEVICE); +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + #if defined(HYPRE_USING_CUBLAS) HYPRE_CUBLAS_CALL( cublasDaxpy(hypre_HandleCublasHandle(hypre_handle()), size, &alpha, x_data, 1, y_data, 1) ); #else HYPRE_THRUST_CALL( transform, x_data, x_data + size, y_data, y_data, alpha * _1 + _2 ); -#endif +#endif // #if defined(HYPRE_USING_CUBLAS) + +#elif defined(HYPRE_USING_SYCL) // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + +#if defined(HYPRE_USING_ONEMKLBLAS) + HYPRE_SYCL_CALL( oneapi::mkl::blas::axpy(*hypre_HandleComputeStream(hypre_handle()), + size, alpha, + x_data, 1, y_data, 1).wait() ); #else + HYPRE_ONEDPL_CALL( std::transform, x_data, x_data + size, y_data, y_data, + [alpha](HYPRE_Complex x, HYPRE_Complex y) -> HYPRE_Complex { return alpha * x + y; } ); +#endif // #if defined(HYPRE_USING_ONEMKL) + +#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + +#else // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) + HYPRE_Int i; #if defined(HYPRE_USING_DEVICE_OPENMP) #pragma omp target teams distribute parallel for private(i) is_device_ptr(y_data, x_data) @@ -539,10 +581,10 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha, y_data[i] += alpha * x_data[i]; } -#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ +#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -567,7 +609,7 @@ hypre_SeqVectorElmdivpy( hypre_Vector *x, HYPRE_Complex *y_data = hypre_VectorData(y); HYPRE_Int size = hypre_VectorSize(b); -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) //HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_VectorMemoryLocation(x), hypre_VectorMemoryLocation(b) ); //RL: TODO back to hypre_GetExecPolicy2 later HYPRE_ExecutionPolicy exec = HYPRE_EXEC_DEVICE; @@ -596,7 +638,7 @@ hypre_SeqVectorElmdivpy( hypre_Vector *x, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -647,7 +689,7 @@ hypre_SeqVectorElmdivpyMarked( hypre_Vector *x, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -678,19 +720,41 @@ hypre_SeqVectorInnerProd( hypre_Vector *x, //hypre_SeqVectorPrefetch(x, HYPRE_MEMORY_DEVICE); //hypre_SeqVectorPrefetch(y, HYPRE_MEMORY_DEVICE); -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) + #ifndef HYPRE_COMPLEX + +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + #if defined(HYPRE_USING_CUBLAS) HYPRE_CUBLAS_CALL( cublasDdot(hypre_HandleCublasHandle(hypre_handle()), size, x_data, 1, y_data, 1, &result) ); #else result = HYPRE_THRUST_CALL( inner_product, x_data, x_data + size, y_data, 0.0 ); -#endif +#endif // #if defined(HYPRE_USING_CUBLAS) + +#elif defined(HYPRE_USING_SYCL) // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + +#if defined(HYPRE_USING_ONEMKLBLAS) + HYPRE_Real *result_dev = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); + HYPRE_SYCL_CALL( oneapi::mkl::blas::dot(*hypre_HandleComputeStream(hypre_handle()), + size, x_data, 1, + y_data, 1, result_dev).wait() ); + hypre_TMemcpy(&result, result_dev, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); + hypre_TFree(result_dev, HYPRE_MEMORY_DEVICE); #else + result = HYPRE_ONEDPL_CALL( std::transform_reduce, x_data, x_data + size, y_data, 0.0 ); +#endif // #if defined(HYPRE_USING_ONEMKLBLAS) + +#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + +#else // #ifndef HYPRE_COMPLEX /* TODO */ #error "Complex inner product" -#endif -#else /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ +#endif // #ifndef HYPRE_COMPLEX + +#else // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) + HYPRE_Int i; #if defined(HYPRE_USING_DEVICE_OPENMP) #pragma omp target teams distribute parallel for private(i) reduction(+:result) is_device_ptr(y_data,x_data) map(result) @@ -701,10 +765,11 @@ hypre_SeqVectorInnerProd( hypre_Vector *x, { result += hypre_conj(y_data[i]) * x_data[i]; } -#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ + +#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -806,7 +871,7 @@ hypre_SeqVectorMax( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime(); diff --git a/src/utilities/Makefile b/src/utilities/Makefile index 0850a98d5..67a90b839 100644 --- a/src/utilities/Makefile +++ b/src/utilities/Makefile @@ -30,6 +30,7 @@ HEADERS =\ mpistubs.h\ threading.h\ timing.h\ + _hypre_onedpl.hpp\ _hypre_utilities.h\ _hypre_utilities.hpp @@ -81,6 +82,7 @@ SONAME = libHYPRE_utilities-${HYPRE_RELEASE_VERSION}${HYPRE_LIB_SUFFIX} all: libHYPRE_utilities${HYPRE_LIB_SUFFIX} cp -fR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include + cp -fR $(srcdir)/_hypre_onedpl.hpp $(HYPRE_BUILD_DIR)/include cp -fR $(srcdir)/_hypre_utilities.h $(HYPRE_BUILD_DIR)/include cp -fR $(srcdir)/_hypre_utilities.hpp $(HYPRE_BUILD_DIR)/include cp -fR $(srcdir)/fortran*.h $(HYPRE_BUILD_DIR)/include diff --git a/src/utilities/_hypre_onedpl.hpp b/src/utilities/_hypre_onedpl.hpp new file mode 100644 index 000000000..ecb5c2155 --- /dev/null +++ b/src/utilities/_hypre_onedpl.hpp @@ -0,0 +1,21 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +#include "HYPRE_config.h" + +#if defined(HYPRE_USING_SYCL) + +/* oneAPI DPL headers */ +/* NOTE: these must be included before standard C++ headers */ + +#include +#include +#include +#include +#include + +#endif diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 0df44e6be..a2b3bc401 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1740,7 +1740,7 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR hypre_UnorderedBigIntMap *inverse_map); #if defined(HYPRE_USING_GPU) -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncComputeStream(hypre_Handle *hypre_handle); HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle); HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle); HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, @@ -1753,6 +1753,13 @@ HYPRE_Int hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex HYPRE_Int hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v); HYPRE_Int hypreDevice_Filln(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v); HYPRE_Int hypreDevice_Scalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v); +HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); +HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind); +HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); +HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr); + #endif HYPRE_Int hypre_CurandUniform( HYPRE_Int n, HYPRE_Real *urand, HYPRE_Int set_seed, @@ -1775,7 +1782,7 @@ char *hypre_strcpy(char *destination, const char *source); HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action); HYPRE_Int hypre_RestoreSyncCudaCompute(); HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr); -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncComputeStream(hypre_Handle *hypre_handle); /* handle.c */ HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse ); diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index fd0f76b5c..afbcbb641 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -104,6 +104,14 @@ struct hypre_device_allocator #include +#if defined(HYPRE_USING_ROCSPARSE) +#include +#endif + +#if defined(HYPRE_USING_ROCRAND) +#include +#endif + /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * sycl includes * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ @@ -112,17 +120,18 @@ struct hypre_device_allocator /* WM: problems with this being inside extern C++ {} */ /* #include */ +#if defined(HYPRE_USING_ONEMKLSPARSE) +#include +#endif +#if defined(HYPRE_USING_ONEMKLBLAS) +#include +#endif +#if defined(HYPRE_USING_ONEMKLRAND) +#include +#endif #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) -#if defined(HYPRE_USING_ROCSPARSE) -#include -#endif - -#if defined(HYPRE_USING_ROCRAND) -#include -#endif - /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * macros for wrapping cuda/hip/sycl calls for error reporting * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ @@ -164,6 +173,29 @@ struct hypre_device_allocator assert(0); exit(1); \ } +#define HYPRE_ONEDPL_CALL(func_name, ...) \ + func_name(oneapi::dpl::execution::make_device_policy( \ + *hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); + +#define HYPRE_SYCL_LAUNCH(kernel_name, gridsize, blocksize, ...) \ +{ \ + if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ + { \ + hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ + __FILE__, __LINE__, \ + gridsize[0], blocksize[0]); \ + assert(0); exit(1); \ + } \ + else \ + { \ + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { \ + cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ + [=] (sycl::nd_item<1> item) { (kernel_name)(item, __VA_ARGS__); \ + }); \ + }).wait_and_throw(); \ + } \ +} + #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #define HYPRE_CUBLAS_CALL(call) do { \ @@ -365,6 +397,10 @@ struct hypre_CsrsvData #elif defined(HYPRE_USING_ROCSPARSE) rocsparse_mat_info info_L; rocsparse_mat_info info_U; +#elif defined(HYPRE_USING_ONEMKLSPARSE) + /* WM: todo - placeholders */ + char info_L; + char info_U; #endif hypre_int BufferSize; char *Buffer; @@ -386,10 +422,15 @@ struct hypre_GpuMatData rocsparse_mat_descr mat_descr; rocsparse_mat_info mat_info; #endif + +#if defined(HYPRE_USING_ONEMKLSPARSE) + oneapi::mkl::sparse::matrix_handle_t mat_handle; +#endif }; #define hypre_GpuMatDataMatDecsr(data) ((data) -> mat_descr) #define hypre_GpuMatDataMatInfo(data) ((data) -> mat_info) +#define hypre_GpuMatDataMatHandle(data) ((data) -> mat_handle) #define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer) #endif //#if defined(HYPRE_USING_GPU) @@ -443,9 +484,9 @@ using namespace thrust::placeholders; #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC @@ -1027,16 +1068,6 @@ HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); -HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); - -HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - HYPRE_Int *d_row_ind); - -HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); - -HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index f00a23415..41ce8e3ad 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -5,6 +5,7 @@ * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ +#include "_hypre_onedpl.hpp" #include "_hypre_utilities.h" #include "_hypre_utilities.hpp" @@ -42,7 +43,156 @@ sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, return gDim; } -#endif +void +hypreSYCLKernel_IVAXPY(sycl::nd_item<1>& item, + HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y ) +{ + HYPRE_Int i = (HYPRE_Int) item.get_global_linear_id(); + + if (i < n) + { + y[i] += x[i] / a[i]; + } +} + +/* Inverse Vector AXPY: y[i] = x[i] / a[i] + y[i] */ +HYPRE_Int +hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y) +{ + /* trivial case */ + if (n <= 0) + { + return hypre_error_flag; + } + + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); + + HYPRE_SYCL_LAUNCH( hypreSYCLKernel_IVAXPY, gDim, bDim, n, a, x, y ); + + return hypre_error_flag; +} + +void +hypreSYCLKernel_IVAXPYMarked(sycl::nd_item<1>& item, + HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y, + HYPRE_Int *marker, HYPRE_Int marker_val) +{ + HYPRE_Int i = (HYPRE_Int) item.get_global_linear_id(); + + if (i < n) + { + if (marker[i] == marker_val) + { + y[i] += x[i] / a[i]; + } + } +} + +/* Inverse Vector AXPY: y[i] = x[i] / a[i] + y[i] */ +HYPRE_Int +hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y, + HYPRE_Int *marker, HYPRE_Int marker_val) +{ + /* trivial case */ + if (n <= 0) + { + return hypre_error_flag; + } + + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); + + HYPRE_SYCL_LAUNCH( hypreSYCLKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); + + return hypre_error_flag; +} + +HYPRE_Int* +hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr) +{ + /* trivial case */ + if (nrows <= 0 || nnz <= 0) + { + return NULL; + } + + HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); + + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind); + + return d_row_ind; +} + +void +hypreSYCLKernel_ScatterRowPtr(sycl::nd_item<1>& item, + HYPRE_Int nrows, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind) +{ + HYPRE_Int i = (HYPRE_Int) item.get_global_linear_id(); + + if (i < nrows) + { + HYPRE_Int row_start = d_row_ptr[i]; + HYPRE_Int row_end = d_row_ptr[i + 1]; + if (row_start != row_end) + { + d_row_ind[row_start] = i; + } + } +} + +HYPRE_Int +hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind) +{ + /* trivial case */ + if (nrows <= 0 || nnz <= 0) + { + return hypre_error_flag; + } + + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(nrows, "thread", bDim); + + HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 ); + + HYPRE_SYCL_LAUNCH( hypreSYCLKernel_ScatterRowPtr, gDim, bDim, nrows, d_row_ptr, d_row_ind ); + + HYPRE_ONEDPL_CALL( std::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, + oneapi::dpl::maximum()); + + return hypre_error_flag; +} + +HYPRE_Int* +hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind) +{ + HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE); + + oneapi::dpl::counting_iterator count(0); + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + d_row_ind, d_row_ind + nnz, + count, + count + nrows + 1, + d_row_ptr); + + return d_row_ptr; +} + +HYPRE_Int +hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr) +{ + oneapi::dpl::counting_iterator count(0); + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + d_row_ind, d_row_ind + nnz, + count, + count + nrows + 1, + d_row_ptr); + + return hypre_error_flag; +} +#endif // #if defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -1381,6 +1531,17 @@ hypre_SyncCudaDevice(hypre_Handle *hypre_handle) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); #elif defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipDeviceSynchronize() ); +#elif defined(HYPRE_USING_SYCL) + try + { + HYPRE_SYCL_CALL( hypre_HandleComputeStream(hypre_handle)->wait_and_throw() ); + } + catch (sycl::exception const &exc) + { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); + } #endif return hypre_error_flag; } @@ -1404,9 +1565,9 @@ hypre_ResetCudaDevice(hypre_Handle *hypre_handle) * 4: sync stream based on cuda_compute_stream_sync */ HYPRE_Int -hypre_SyncCudaComputeStream_core(HYPRE_Int action, - hypre_Handle *hypre_handle, - HYPRE_Int *cuda_compute_stream_sync_ptr) +hypre_SyncComputeStream_core(HYPRE_Int action, + hypre_Handle *hypre_handle, + HYPRE_Int *cuda_compute_stream_sync_ptr) { /* with UVM the default is to sync at kernel completions, since host is also able to * touch GPU memory */ @@ -1448,7 +1609,7 @@ hypre_SyncCudaComputeStream_core(HYPRE_Int action, #endif break; default: - hypre_printf("hypre_SyncCudaComputeStream_core invalid action\n"); + hypre_printf("hypre_SyncComputeStream_core invalid action\n"); hypre_error_in_arg(1); } @@ -1460,7 +1621,7 @@ hypre_SetSyncCudaCompute(HYPRE_Int action) { /* convert to 1/0 */ action = action != 0; - hypre_SyncCudaComputeStream_core(action, NULL, NULL); + hypre_SyncComputeStream_core(action, NULL, NULL); return hypre_error_flag; } @@ -1468,7 +1629,7 @@ hypre_SetSyncCudaCompute(HYPRE_Int action) HYPRE_Int hypre_RestoreSyncCudaCompute() { - hypre_SyncCudaComputeStream_core(2, NULL, NULL); + hypre_SyncComputeStream_core(2, NULL, NULL); return hypre_error_flag; } @@ -1476,15 +1637,15 @@ hypre_RestoreSyncCudaCompute() HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr) { - hypre_SyncCudaComputeStream_core(3, NULL, cuda_compute_stream_sync_ptr); + hypre_SyncComputeStream_core(3, NULL, cuda_compute_stream_sync_ptr); return hypre_error_flag; } HYPRE_Int -hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle) +hypre_SyncComputeStream(hypre_Handle *hypre_handle) { - hypre_SyncCudaComputeStream_core(4, hypre_handle, NULL); + hypre_SyncComputeStream_core(4, hypre_handle, NULL); return hypre_error_flag; } diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 9270ec0ae..34932d533 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -47,6 +47,14 @@ #include +#if defined(HYPRE_USING_ROCSPARSE) +#include +#endif + +#if defined(HYPRE_USING_ROCRAND) +#include +#endif + /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * sycl includes * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ @@ -55,17 +63,18 @@ /* WM: problems with this being inside extern C++ {} */ /* #include */ +#if defined(HYPRE_USING_ONEMKLSPARSE) +#include +#endif +#if defined(HYPRE_USING_ONEMKLBLAS) +#include +#endif +#if defined(HYPRE_USING_ONEMKLRAND) +#include +#endif #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) -#if defined(HYPRE_USING_ROCSPARSE) -#include -#endif - -#if defined(HYPRE_USING_ROCRAND) -#include -#endif - /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * macros for wrapping cuda/hip/sycl calls for error reporting * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ @@ -107,6 +116,29 @@ assert(0); exit(1); \ } +#define HYPRE_ONEDPL_CALL(func_name, ...) \ + func_name(oneapi::dpl::execution::make_device_policy( \ + *hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); + +#define HYPRE_SYCL_LAUNCH(kernel_name, gridsize, blocksize, ...) \ +{ \ + if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ + { \ + hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ + __FILE__, __LINE__, \ + gridsize[0], blocksize[0]); \ + assert(0); exit(1); \ + } \ + else \ + { \ + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { \ + cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ + [=] (sycl::nd_item<1> item) { (kernel_name)(item, __VA_ARGS__); \ + }); \ + }).wait_and_throw(); \ + } \ +} + #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #define HYPRE_CUBLAS_CALL(call) do { \ @@ -308,6 +340,10 @@ struct hypre_CsrsvData #elif defined(HYPRE_USING_ROCSPARSE) rocsparse_mat_info info_L; rocsparse_mat_info info_U; +#elif defined(HYPRE_USING_ONEMKLSPARSE) + /* WM: todo - placeholders */ + char info_L; + char info_U; #endif hypre_int BufferSize; char *Buffer; @@ -329,10 +365,15 @@ struct hypre_GpuMatData rocsparse_mat_descr mat_descr; rocsparse_mat_info mat_info; #endif + +#if defined(HYPRE_USING_ONEMKLSPARSE) + oneapi::mkl::sparse::matrix_handle_t mat_handle; +#endif }; #define hypre_GpuMatDataMatDecsr(data) ((data) -> mat_descr) #define hypre_GpuMatDataMatInfo(data) ((data) -> mat_info) +#define hypre_GpuMatDataMatHandle(data) ((data) -> mat_handle) #define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer) #endif //#if defined(HYPRE_USING_GPU) @@ -386,9 +427,9 @@ using namespace thrust::placeholders; #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC @@ -970,16 +1011,6 @@ HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); -HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); - -HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - HYPRE_Int *d_row_ind); - -HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); - -HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); diff --git a/src/utilities/int_array.c b/src/utilities/int_array.c index 7a51fbb80..cea320105 100644 --- a/src/utilities/int_array.c +++ b/src/utilities/int_array.c @@ -168,7 +168,7 @@ hypre_IntArraySetConstantValues( hypre_IntArray *v, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncComputeStream(hypre_handle()); #endif return ierr; diff --git a/src/utilities/protos.h b/src/utilities/protos.h index eb41f9984..b7f7cfea7 100644 --- a/src/utilities/protos.h +++ b/src/utilities/protos.h @@ -269,7 +269,7 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR hypre_UnorderedBigIntMap *inverse_map); #if defined(HYPRE_USING_GPU) -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncComputeStream(hypre_Handle *hypre_handle); HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle); HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle); HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, @@ -282,6 +282,13 @@ HYPRE_Int hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex HYPRE_Int hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v); HYPRE_Int hypreDevice_Filln(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v); HYPRE_Int hypreDevice_Scalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v); +HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); +HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind); +HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); +HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr); + #endif HYPRE_Int hypre_CurandUniform( HYPRE_Int n, HYPRE_Real *urand, HYPRE_Int set_seed, @@ -304,7 +311,7 @@ char *hypre_strcpy(char *destination, const char *source); HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action); HYPRE_Int hypre_RestoreSyncCudaCompute(); HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr); -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncComputeStream(hypre_Handle *hypre_handle); /* handle.c */ HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse );