SYCL support for AMG solve phase (#549)
This adds matvec, matrix transpose, and vector operations (axpy, inner product, etc.) with sycl backend (via oneMKL and oneDPL) for running on Intel GPUs. Thus, the AMG solve phase can now execute entirely on Intel GPUs.
This commit is contained in:
parent
b159c7dd58
commit
a7bb784a45
@ -113,6 +113,10 @@ option(HYPRE_ENABLE_CUBLAS "Use cuBLAS" OFF)
|
||||
option(HYPRE_ENABLE_CURAND "Use cuRAND" ON)
|
||||
option(HYPRE_ENABLE_GPU_PROFILING "Use NVTX on CUDA" OFF)
|
||||
set(HYPRE_CUDA_SM "70" CACHE STRING "Target CUDA architecture.")
|
||||
# oneAPI options
|
||||
option(HYPRE_ENABLE_ONEMKLSPARSE "Use oneMKL sparse" ON)
|
||||
option(HYPRE_ENABLE_ONEMKLBLAS "Use oneMKL blas" ON)
|
||||
option(HYPRE_ENABLE_ONEMKLRAND "Use oneMKL rand" ON)
|
||||
|
||||
option(TPL_DSUPERLU_LIBRARIES "List of absolute paths to SuperLU_Dist link libraries [].")
|
||||
option(TPL_DSUPERLU_INCLUDE_DIRS "List of absolute paths to SuperLU_Dist include directories [].")
|
||||
@ -421,6 +425,26 @@ if (HYPRE_USING_CUDA)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (HYPRE_USING_SYCL)
|
||||
if (HYPRE_ENABLE_ONEMKLSPARSE)
|
||||
set(HYPRE_USING_ONEMKLSPARSE ON CACHE BOOL "" FORCE)
|
||||
endif()
|
||||
if (HYPRE_ENABLE_ONEMKLBLAS)
|
||||
set(HYPRE_USING_ONEMKLBLAS ON CACHE BOOL "" FORCE)
|
||||
endif()
|
||||
if (HYPRE_ENABLE_ONEMKLRAND)
|
||||
set(HYPRE_USING_ONEMKLRAND ON CACHE BOOL "" FORCE)
|
||||
endif()
|
||||
if (HYPRE_USING_ONEMKLSPARSE OR HYPRE_USING_ONEMKLBLAS OR HYPRE_USING_ONEMKLRAND)
|
||||
set(MKL_LINK static)
|
||||
set(MKL_THREADING sequential)
|
||||
find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}/lib/cmake/mkl")
|
||||
target_compile_options(${PROJECT_NAME} PUBLIC $<TARGET_PROPERTY:MKL::MKL_DPCPP,INTERFACE_COMPILE_OPTIONS>)
|
||||
target_include_directories(${PROJECT_NAME} PUBLIC $<TARGET_PROPERTY:MKL::MKL_DPCPP,INTERFACE_INCLUDE_DIRECTORIES>)
|
||||
target_link_libraries(${PROJECT_NAME} PUBLIC $<LINK_ONLY:MKL::MKL_DPCPP>)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (HYPRE_USING_CALIPER)
|
||||
find_package(caliper REQUIRED)
|
||||
target_link_libraries(${PROJECT_NAME} PUBLIC caliper)
|
||||
|
||||
@ -282,7 +282,10 @@ hypre_IJMatrixInitializeParCSR_v2(hypre_IJMatrix *matrix, HYPRE_MemoryLocation m
|
||||
hypre_ParCSRMatrixInitialize_v2(par_matrix, memory_location);
|
||||
hypre_AuxParCSRMatrixInitialize_v2(aux_matrix, memory_location_aux);
|
||||
|
||||
if (memory_location_aux == HYPRE_MEMORY_HOST)
|
||||
/* WM: TODO - implement for sycl... is this available for other non-cuda/hip gpu implementations? */
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
if (hypre_GetExecPolicy1(memory_location_aux) == HYPRE_EXEC_HOST)
|
||||
#endif
|
||||
{
|
||||
if (hypre_AuxParCSRMatrixDiagSizes(aux_matrix))
|
||||
{
|
||||
|
||||
@ -264,12 +264,15 @@ hypre_AuxParCSRMatrixInitialize_v2( hypre_AuxParCSRMatrix *matrix,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* WM: Q - added the macro guards here (since IJ assembly not yet ported to sycl)... is this OK/correct? */
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
if (memory_location != HYPRE_MEMORY_HOST)
|
||||
{
|
||||
/* GPU assembly */
|
||||
hypre_AuxParCSRMatrixNeedAux(matrix) = 1;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/* CPU assembly */
|
||||
/* allocate stash for setting or adding off processor values */
|
||||
|
||||
@ -36,6 +36,9 @@ set(HYPRE_ENABLE_DEVICE_POOL @HYPRE_ENABLE_DEVICE_POOL@)
|
||||
set(HYPRE_ENABLE_CUBLAS @HYPRE_ENABLE_CUBLAS@)
|
||||
set(HYPRE_ENABLE_CURAND @HYPRE_ENABLE_CURAND@)
|
||||
set(HYPRE_ENABLE_GPU_PROFILING @HYPRE_ENABLE_GPU_PROFILING@)
|
||||
set(HYPRE_ENABLE_ONEMKLSPARSE @HYPRE_ENABLE_ONEMKLSPARSE@)
|
||||
set(HYPRE_ENABLE_ONEMKLBLAS @HYPRE_ENABLE_ONEMKLBLAS@)
|
||||
set(HYPRE_ENABLE_ONEMKLRAND @HYPRE_ENABLE_ONEMKLRAND@)
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
|
||||
|
||||
|
||||
@ -103,6 +103,15 @@
|
||||
/* Use NVTX */
|
||||
#cmakedefine HYPRE_USING_NVTX 1
|
||||
|
||||
/* Use oneMLK spasre */
|
||||
#cmakedefine HYPRE_USING_ONEMKLSPARSE 1
|
||||
|
||||
/* Use oneMLK blas */
|
||||
#cmakedefine HYPRE_USING_ONEMKLBLAS 1
|
||||
|
||||
/* Use oneMLK rand */
|
||||
#cmakedefine HYPRE_USING_ONEMKLRAND 1
|
||||
|
||||
/* Use SuperLU_Dist */
|
||||
#cmakedefine HYPRE_USING_DSUPERLU 1
|
||||
|
||||
|
||||
@ -211,6 +211,7 @@ hypre_using_onemklsparse=no
|
||||
hypre_using_onemklblas=no
|
||||
hypre_using_onemklrand=no
|
||||
|
||||
hypre_found_mkl=no
|
||||
|
||||
dnl *********************************************************************
|
||||
dnl * Initialize flag-check variables
|
||||
@ -519,6 +520,39 @@ AS_HELP_STRING([--enable-gpu-aware-mpi],
|
||||
[hypre_gpu_mpi=no]
|
||||
)
|
||||
|
||||
AC_ARG_ENABLE(onemklsparse,
|
||||
AS_HELP_STRING([--enable-onemklsparse],
|
||||
[Use oneMKL sparse (default is YES).]),
|
||||
[case "${enableval}" in
|
||||
yes) hypre_using_onemklsparse=yes ;;
|
||||
no) hypre_using_onemklsparse=no ;;
|
||||
*) hypre_using_onemklsparse=yes ;;
|
||||
esac],
|
||||
[hypre_using_onemklsparse=yes]
|
||||
)
|
||||
|
||||
AC_ARG_ENABLE(onemklblas,
|
||||
AS_HELP_STRING([--enable-onemklblas],
|
||||
[Use oneMKL blas (default is YES).]),
|
||||
[case "${enableval}" in
|
||||
yes) hypre_using_onemklblas=yes ;;
|
||||
no) hypre_using_onemklblas=no ;;
|
||||
*) hypre_using_onemklblas=yes ;;
|
||||
esac],
|
||||
[hypre_using_onemklblas=yes]
|
||||
)
|
||||
|
||||
AC_ARG_ENABLE(onemklrand,
|
||||
AS_HELP_STRING([--enable-onemklrand],
|
||||
[Use oneMKL rand (default is YES).]),
|
||||
[case "${enableval}" in
|
||||
yes) hypre_using_onemklrand=yes ;;
|
||||
no) hypre_using_onemklrand=no ;;
|
||||
*) hypre_using_onemklrand=yes ;;
|
||||
esac],
|
||||
[hypre_using_onemklrand=yes]
|
||||
)
|
||||
|
||||
dnl * The AC_DEFINE is below, after hypre_using_mpi is completely set
|
||||
dnl * Need to change to a new approach that always defines variable to some value
|
||||
|
||||
@ -2311,45 +2345,37 @@ AS_IF([test x"$hypre_using_sycl" == x"yes"],
|
||||
AC_DEFINE(HYPRE_USING_GPU, 1, [Define to 1 if executing on GPU device])
|
||||
AC_DEFINE(HYPRE_USING_SYCL, 1, [SYCL being used])
|
||||
|
||||
dnl (Ab)Using CUCC when compiling HIP
|
||||
LINK_CC=${CUCC}
|
||||
LINK_CXX=${CUCC}
|
||||
|
||||
if test "$hypre_user_chose_cuflags" = "no"
|
||||
SYCLFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
|
||||
if test "$hypre_using_debug" = "yes"
|
||||
then
|
||||
CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
|
||||
if test "$hypre_using_debug" = "yes"
|
||||
then
|
||||
CUFLAGS="-O0 -Wall -g ${CUFLAGS}"
|
||||
else
|
||||
CUFLAGS="-g -O3 ${CUFLAGS}"
|
||||
fi
|
||||
SYCLFLAGS="-Wall -g ${SYCLFLAGS}"
|
||||
else
|
||||
SYCLFLAGS="-g -O3 ${SYCLFLAGS}"
|
||||
fi
|
||||
|
||||
dnl (Ab)Use CUFLAGS to capture SYCL compilation flags
|
||||
if test "$hypre_user_chose_cuflags" = "no"
|
||||
then
|
||||
CUFLAGS="${SYCLFLAGS}"
|
||||
fi
|
||||
|
||||
dnl If not in debug mode, at least -O2, but the user can override with
|
||||
dnl with SYCLFLAGS on the configure line. If in debug mode, -O0 -Wall
|
||||
dnl plus flags for debugging symbols
|
||||
AS_IF([test x"$hypre_using_debug" == x"yes"],
|
||||
[SYCLFLAGS="-O0 -Wall -g ${SYCLFLAGS}"],
|
||||
[SYCLFLAGS="-O2 ${SYCLFLAGS}"],)
|
||||
|
||||
AS_IF([test x"$hypre_using_onemklsparse" == x"yes"],
|
||||
[AC_DEFINE(HYPRE_USING_ONEMKLSPARSE, 1, [onemkl::SPARSE being used])
|
||||
HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl"
|
||||
HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/mkl/spblas.hpp"
|
||||
AS_IF([test x"$hypre_using_onemklsparse" == x"yes" || test x"$hypre_using_onemklblas" == x"yes" || test x"$hypre_using_onemklrand" == x"yes"],
|
||||
[AC_CHECK_HEADERS(["${MKLROOT}/include/mkl.h"],
|
||||
[hypre_found_mkl=yes],
|
||||
AC_MSG_ERROR([unable to find oneMKL ... Ensure that MKLROOT is set]))
|
||||
HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} ${MKLROOT}/lib/intel64/libmkl_sycl.a -Wl,-export-dynamic -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_sequential.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -lsycl -lOpenCL -lpthread -lm -ldl"
|
||||
HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include"
|
||||
])
|
||||
|
||||
AS_IF([test x"$hypre_using_onemklsparse" == x"yes"], [AC_DEFINE(HYPRE_USING_ONEMKLSPARSE, 1, [onemkl::SPARSE being used])])
|
||||
|
||||
AS_IF([test x"$hypre_using_onemklblas" == x"yes"],
|
||||
[AC_DEFINE(HYPRE_USING_ONEMKLBLAS, 1, [onemkl::BLAS being used])
|
||||
HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl"
|
||||
HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp"
|
||||
])
|
||||
AS_IF([test x"$hypre_using_onemklblas" == x"yes"], [AC_DEFINE(HYPRE_USING_ONEMKLBLAS, 1, [onemkl::BLAS being used])])
|
||||
|
||||
AS_IF([test x"$hypre_using_onemklrand" == x"yes"],
|
||||
[AC_DEFINE(HYPRE_USING_ONEMKLRAND, 1, [onemkl::rng being used])
|
||||
HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl"
|
||||
HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/rng.hpp"
|
||||
])
|
||||
AS_IF([test x"$hypre_using_onemklrand" == x"yes"], [AC_DEFINE(HYPRE_USING_ONEMKLRAND, 1, [onemkl::rng being used])])
|
||||
|
||||
]) dnl AS_IF([test x"$hypre_using_sycl" == x"yes"]
|
||||
|
||||
|
||||
95
src/configure
vendored
95
src/configure
vendored
@ -792,6 +792,9 @@ enable_rocblas
|
||||
enable_rocrand
|
||||
enable_gpu_profiling
|
||||
enable_gpu_aware_mpi
|
||||
enable_onemklsparse
|
||||
enable_onemklblas
|
||||
enable_onemklrand
|
||||
with_LD
|
||||
with_LDFLAGS
|
||||
with_extra_CFLAGS
|
||||
@ -1520,6 +1523,9 @@ Optional Features:
|
||||
--enable-rocrand Use rocRAND (default is YES).
|
||||
--enable-gpu-profiling Use NVTX on CUDA, rocTX on HIP (default is NO).
|
||||
--enable-gpu-aware-mpi Use GPU memory aware MPI
|
||||
--enable-onemklsparse Use oneMKL sparse (default is YES).
|
||||
--enable-onemklblas Use oneMKL blas (default is YES).
|
||||
--enable-onemklrand Use oneMKL rand (default is YES).
|
||||
|
||||
Optional Packages:
|
||||
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
|
||||
@ -2819,6 +2825,7 @@ hypre_using_onemklsparse=no
|
||||
hypre_using_onemklblas=no
|
||||
hypre_using_onemklrand=no
|
||||
|
||||
hypre_found_mkl=no
|
||||
|
||||
hypre_blas_lib_old_style=no
|
||||
hypre_blas_lib_dir_old_style=no
|
||||
@ -3247,6 +3254,45 @@ else
|
||||
fi
|
||||
|
||||
|
||||
# Check whether --enable-onemklsparse was given.
|
||||
if test "${enable_onemklsparse+set}" = set; then :
|
||||
enableval=$enable_onemklsparse; case "${enableval}" in
|
||||
yes) hypre_using_onemklsparse=yes ;;
|
||||
no) hypre_using_onemklsparse=no ;;
|
||||
*) hypre_using_onemklsparse=yes ;;
|
||||
esac
|
||||
else
|
||||
hypre_using_onemklsparse=yes
|
||||
|
||||
fi
|
||||
|
||||
|
||||
# Check whether --enable-onemklblas was given.
|
||||
if test "${enable_onemklblas+set}" = set; then :
|
||||
enableval=$enable_onemklblas; case "${enableval}" in
|
||||
yes) hypre_using_onemklblas=yes ;;
|
||||
no) hypre_using_onemklblas=no ;;
|
||||
*) hypre_using_onemklblas=yes ;;
|
||||
esac
|
||||
else
|
||||
hypre_using_onemklblas=yes
|
||||
|
||||
fi
|
||||
|
||||
|
||||
# Check whether --enable-onemklrand was given.
|
||||
if test "${enable_onemklrand+set}" = set; then :
|
||||
enableval=$enable_onemklrand; case "${enableval}" in
|
||||
yes) hypre_using_onemklrand=yes ;;
|
||||
no) hypre_using_onemklrand=no ;;
|
||||
*) hypre_using_onemklrand=yes ;;
|
||||
esac
|
||||
else
|
||||
hypre_using_onemklrand=yes
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
if test "x$CC" = "x"
|
||||
then
|
||||
@ -9138,24 +9184,40 @@ $as_echo "#define HYPRE_USING_GPU 1" >>confdefs.h
|
||||
$as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h
|
||||
|
||||
|
||||
LINK_CC=${CUCC}
|
||||
LINK_CC=${CUCC}
|
||||
LINK_CXX=${CUCC}
|
||||
|
||||
if test "$hypre_user_chose_cuflags" = "no"
|
||||
SYCLFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
|
||||
if test "$hypre_using_debug" = "yes"
|
||||
then
|
||||
CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
|
||||
if test "$hypre_using_debug" = "yes"
|
||||
then
|
||||
CUFLAGS="-O0 -Wall -g ${CUFLAGS}"
|
||||
else
|
||||
CUFLAGS="-g -O3 ${CUFLAGS}"
|
||||
fi
|
||||
SYCLFLAGS="-Wall -g ${SYCLFLAGS}"
|
||||
else
|
||||
SYCLFLAGS="-g -O3 ${SYCLFLAGS}"
|
||||
fi
|
||||
|
||||
if test "$hypre_user_chose_cuflags" = "no"
|
||||
then
|
||||
CUFLAGS="${SYCLFLAGS}"
|
||||
fi
|
||||
|
||||
if test x"$hypre_using_debug" == x"yes"; then :
|
||||
SYCLFLAGS="-O0 -Wall -g ${SYCLFLAGS}"
|
||||
elif SYCLFLAGS="-O2 ${SYCLFLAGS}"; then :
|
||||
if test x"$hypre_using_onemklsparse" == x"yes" || test x"$hypre_using_onemklblas" == x"yes" || test x"$hypre_using_onemklrand" == x"yes"; then :
|
||||
for ac_header in "${MKLROOT}/include/mkl.h"
|
||||
do :
|
||||
as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
|
||||
ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
|
||||
if eval test \"x\$"$as_ac_Header"\" = x"yes"; then :
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1
|
||||
_ACEOF
|
||||
hypre_found_mkl=yes
|
||||
else
|
||||
as_fn_error $? "unable to find oneMKL ... Ensure that MKLROOT is set" "$LINENO" 5
|
||||
fi
|
||||
|
||||
done
|
||||
|
||||
HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} ${MKLROOT}/lib/intel64/libmkl_sycl.a -Wl,-export-dynamic -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_sequential.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -lsycl -lOpenCL -lpthread -lm -ldl"
|
||||
HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include"
|
||||
|
||||
fi
|
||||
|
||||
@ -9163,27 +9225,18 @@ fi
|
||||
|
||||
$as_echo "#define HYPRE_USING_ONEMKLSPARSE 1" >>confdefs.h
|
||||
|
||||
HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl"
|
||||
HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/mkl/spblas.hpp"
|
||||
|
||||
fi
|
||||
|
||||
if test x"$hypre_using_onemklblas" == x"yes"; then :
|
||||
|
||||
$as_echo "#define HYPRE_USING_ONEMKLBLAS 1" >>confdefs.h
|
||||
|
||||
HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl"
|
||||
HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp"
|
||||
|
||||
fi
|
||||
|
||||
if test x"$hypre_using_onemklrand" == x"yes"; then :
|
||||
|
||||
$as_echo "#define HYPRE_USING_ONEMKLRAND 1" >>confdefs.h
|
||||
|
||||
HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl"
|
||||
HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/rng.hpp"
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
@ -102,7 +102,7 @@ hypre_DistributedMatrixGetRowParCSR( hypre_DistributedMatrix *matrix,
|
||||
|
||||
// RL: if HYPRE_ParCSRMatrixGetRow was on device, need the next line to guarantee it's done
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
return(ierr);
|
||||
|
||||
@ -459,7 +459,7 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A)
|
||||
HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
|
||||
nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd);
|
||||
|
||||
//hypre_SyncCudaComputeStream(hypre_handle());
|
||||
//hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
@ -632,25 +632,24 @@ hypre_BoomerAMGBuildModPartialExtInterp( hypre_ParCSRMatrix *A,
|
||||
hypre_GpuProfilingPushRange("PartialExtInterp");
|
||||
#endif
|
||||
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
|
||||
|
||||
HYPRE_Int ierr = 0;
|
||||
|
||||
if (exec == HYPRE_EXEC_HOST)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
|
||||
if (exec == HYPRE_EXEC_DEVICE)
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildModPartialExtInterpDevice(A, CF_marker, S, num_cpts_global,
|
||||
num_old_cpts_global,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildModPartialExtInterpHost(A, CF_marker, S, num_cpts_global,
|
||||
num_old_cpts_global,
|
||||
num_functions, dof_func,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
else
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildModPartialExtInterpDevice(A, CF_marker, S, num_cpts_global,
|
||||
num_old_cpts_global,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
hypre_GpuProfilingPopRange();
|
||||
@ -1338,25 +1337,24 @@ hypre_BoomerAMGBuildModPartialExtPEInterp( hypre_ParCSRMatrix *A,
|
||||
hypre_GpuProfilingPushRange("PartialExtPEInterp");
|
||||
#endif
|
||||
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
|
||||
|
||||
HYPRE_Int ierr = 0;
|
||||
|
||||
if (exec == HYPRE_EXEC_HOST)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
|
||||
if (exec == HYPRE_EXEC_DEVICE)
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildModPartialExtPEInterpDevice(A, CF_marker, S, num_cpts_global,
|
||||
num_old_cpts_global,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildModPartialExtPEInterpHost(A, CF_marker, S, num_cpts_global,
|
||||
num_old_cpts_global,
|
||||
num_functions, dof_func,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
else
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildModPartialExtPEInterpDevice(A, CF_marker, S, num_cpts_global,
|
||||
num_old_cpts_global,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
hypre_GpuProfilingPopRange();
|
||||
|
||||
@ -217,7 +217,7 @@ hypre_BoomerAMGDD_FAC_Jacobi( void *amgdd_vdata,
|
||||
HYPRE_Int level,
|
||||
HYPRE_Int cycle_param )
|
||||
{
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
hypre_ParAMGDDData *amgdd_data = (hypre_ParAMGDDData*) amgdd_vdata;
|
||||
hypre_AMGDDCompGrid *compGrid = hypre_ParAMGDDDataCompGrid(amgdd_data)[level];
|
||||
HYPRE_MemoryLocation memory_location = hypre_AMGDDCompGridMemoryLocation(compGrid);
|
||||
@ -558,7 +558,7 @@ hypre_BoomerAMGDD_FAC_CFL1Jacobi( void *amgdd_vdata,
|
||||
HYPRE_Int level,
|
||||
HYPRE_Int cycle_param )
|
||||
{
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
hypre_ParAMGDDData *amgdd_data = (hypre_ParAMGDDData*) amgdd_vdata;
|
||||
hypre_AMGDDCompGrid *compGrid = hypre_ParAMGDDDataCompGrid(amgdd_data)[level];
|
||||
HYPRE_MemoryLocation memory_location = hypre_AMGDDCompGridMemoryLocation(compGrid);
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
#include "_hypre_parcsr_ls.h"
|
||||
#include "_hypre_utilities.hpp"
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
HYPRE_Int
|
||||
hypre_BoomerAMGDD_FAC_JacobiDevice( void *amgdd_vdata,
|
||||
@ -149,4 +149,4 @@ hypre_BoomerAMGDD_FAC_CFL1JacobiDevice( void *amgdd_vdata,
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
@ -554,23 +554,22 @@ hypre_BoomerAMGBuildModExtInterp(hypre_ParCSRMatrix *A,
|
||||
hypre_GpuProfilingPushRange("ModExtInterp");
|
||||
#endif
|
||||
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
|
||||
|
||||
HYPRE_Int ierr = 0;
|
||||
|
||||
if (exec == HYPRE_EXEC_HOST)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
|
||||
if (exec == HYPRE_EXEC_DEVICE)
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildExtInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildModExtInterpHost(A, CF_marker, S, num_cpts_global, num_functions,
|
||||
dof_func,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
else
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildExtInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
hypre_GpuProfilingPopRange();
|
||||
@ -1869,23 +1868,22 @@ hypre_BoomerAMGBuildModExtPEInterp(hypre_ParCSRMatrix *A,
|
||||
hypre_GpuProfilingPushRange("ModExtPEInterp");
|
||||
#endif
|
||||
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
|
||||
|
||||
HYPRE_Int ierr = 0;
|
||||
|
||||
if (exec == HYPRE_EXEC_HOST)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
|
||||
if (exec == HYPRE_EXEC_DEVICE)
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildExtPEInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildModExtPEInterpHost(A, CF_marker, S, num_cpts_global,
|
||||
num_functions, dof_func,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
else
|
||||
{
|
||||
ierr = hypre_BoomerAMGBuildExtPEInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL,
|
||||
debug_flag, trunc_factor, max_elmts, P_ptr);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
hypre_GpuProfilingPopRange();
|
||||
|
||||
@ -122,8 +122,8 @@ hypre_BoomerAMGBuildModMultipassHost( hypre_ParCSRMatrix *A,
|
||||
pass_starts = hypre_CTAlloc(HYPRE_Int, 10, HYPRE_MEMORY_HOST);
|
||||
/* contains beginning for each pass in pass_order field, assume no more than 10 passes */
|
||||
|
||||
P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_HOST);
|
||||
P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_HOST);
|
||||
P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_DEVICE);
|
||||
P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
cnt = 0;
|
||||
remaining = 0;
|
||||
@ -351,10 +351,10 @@ hypre_BoomerAMGBuildModMultipassHost( hypre_ParCSRMatrix *A,
|
||||
P_offd_i[i + 1] += P_offd_i[i];
|
||||
}
|
||||
|
||||
P_diag_j = hypre_CTAlloc(HYPRE_Int, P_diag_i[n_fine], HYPRE_MEMORY_HOST);
|
||||
P_diag_data = hypre_CTAlloc(HYPRE_Real, P_diag_i[n_fine], HYPRE_MEMORY_HOST);
|
||||
P_offd_j = hypre_CTAlloc(HYPRE_Int, P_offd_i[n_fine], HYPRE_MEMORY_HOST);
|
||||
P_offd_data = hypre_CTAlloc(HYPRE_Real, P_offd_i[n_fine], HYPRE_MEMORY_HOST);
|
||||
P_diag_j = hypre_CTAlloc(HYPRE_Int, P_diag_i[n_fine], HYPRE_MEMORY_DEVICE);
|
||||
P_diag_data = hypre_CTAlloc(HYPRE_Real, P_diag_i[n_fine], HYPRE_MEMORY_DEVICE);
|
||||
P_offd_j = hypre_CTAlloc(HYPRE_Int, P_offd_i[n_fine], HYPRE_MEMORY_DEVICE);
|
||||
P_offd_data = hypre_CTAlloc(HYPRE_Real, P_offd_i[n_fine], HYPRE_MEMORY_DEVICE);
|
||||
|
||||
/* insert weights for coarse points */
|
||||
for (i = 0; i < pass_starts[1]; i++)
|
||||
|
||||
@ -255,8 +255,8 @@ hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix *A,
|
||||
if (pass_array_size) { pass_array = hypre_CTAlloc(HYPRE_Int, pass_array_size, HYPRE_MEMORY_HOST); }
|
||||
pass_pointer = hypre_CTAlloc(HYPRE_Int, max_num_passes + 1, HYPRE_MEMORY_HOST);
|
||||
if (n_fine) { assigned = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); }
|
||||
P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_HOST);
|
||||
P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_HOST);
|
||||
P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_DEVICE);
|
||||
P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine + 1, HYPRE_MEMORY_DEVICE);
|
||||
if (n_coarse) { C_array = hypre_CTAlloc(HYPRE_Int, n_coarse, HYPRE_MEMORY_HOST); }
|
||||
|
||||
if (num_cols_offd)
|
||||
@ -1138,14 +1138,14 @@ hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix *A,
|
||||
hypre_TFree(cnt_nz_offd_per_thread, HYPRE_MEMORY_HOST);
|
||||
hypre_TFree(max_num_threads, HYPRE_MEMORY_HOST);
|
||||
|
||||
P_diag_j = hypre_CTAlloc(HYPRE_Int, total_nz, HYPRE_MEMORY_HOST);
|
||||
P_diag_data = hypre_CTAlloc(HYPRE_Real, total_nz, HYPRE_MEMORY_HOST);
|
||||
P_diag_j = hypre_CTAlloc(HYPRE_Int, total_nz, HYPRE_MEMORY_DEVICE);
|
||||
P_diag_data = hypre_CTAlloc(HYPRE_Real, total_nz, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
|
||||
if (total_nz_offd)
|
||||
{
|
||||
P_offd_j = hypre_CTAlloc(HYPRE_Int, total_nz_offd, HYPRE_MEMORY_HOST);
|
||||
P_offd_data = hypre_CTAlloc(HYPRE_Real, total_nz_offd, HYPRE_MEMORY_HOST);
|
||||
P_offd_j = hypre_CTAlloc(HYPRE_Int, total_nz_offd, HYPRE_MEMORY_DEVICE);
|
||||
P_offd_data = hypre_CTAlloc(HYPRE_Real, total_nz_offd, HYPRE_MEMORY_DEVICE);
|
||||
}
|
||||
|
||||
for (i = 0; i < n_fine; i++)
|
||||
|
||||
@ -1145,7 +1145,7 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A,
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
hypre_SetSyncCudaCompute(sync_stream);
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
return hypre_error_flag;
|
||||
|
||||
@ -169,7 +169,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A,
|
||||
rowsums_upper,
|
||||
scale);
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
e_min = HYPRE_THRUST_CALL(reduce, rowsums_lower, rowsums_lower + A_num_rows, (HYPRE_Real)0,
|
||||
thrust::minimum<HYPRE_Real>());
|
||||
@ -323,7 +323,7 @@ hypre_ParCSRMaxEigEstimateCGDevice(hypre_ParCSRMatrix *A, /* matrix to relax
|
||||
/* set residual to random */
|
||||
hypre_CurandUniform(local_size, r_data, 0, 0, 0, 0);
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
HYPRE_THRUST_CALL(transform,
|
||||
r_data, r_data + local_size, r_data,
|
||||
|
||||
@ -434,7 +434,7 @@ hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int job,
|
||||
recv_data = recv_data_in;
|
||||
// TODO RL: it seems that we need to sync the CUDA stream before doing GPU-GPU MPI.
|
||||
// Need to check MPI documentation whether this is acutally true
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
num_requests = num_sends + num_recvs;
|
||||
|
||||
@ -4113,7 +4113,7 @@ hypre_ParTMatmul( hypre_ParCSRMatrix *A,
|
||||
if ( hypre_GetExecPolicy2(memory_location_A, memory_location_B) == HYPRE_EXEC_DEVICE )
|
||||
{
|
||||
hypre_CSRMatrixMoveDiagFirstDevice(hypre_ParCSRMatrixDiag(C));
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@ -306,7 +306,7 @@ hypre_MergeDiagAndOffdDevice(hypre_ParCSRMatrix *A)
|
||||
hypre_CSRMatrixData(B) = B_a;
|
||||
hypre_CSRMatrixMemoryLocation(B) = HYPRE_MEMORY_DEVICE;
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return B;
|
||||
}
|
||||
@ -1044,7 +1044,7 @@ hypre_ParCSRMatrixGetRowDevice( hypre_ParCSRMatrix *mat,
|
||||
*values = hypre_ParCSRMatrixRowvalues(mat);
|
||||
}
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -1603,7 +1603,7 @@ hypre_ParCSRDiagScale( HYPRE_ParCSRMatrix HA,
|
||||
HYPRE_Int ierr = 0;
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
hypreDevice_DiagScaleVector(local_size, A_i, A_data, y_data, 0.0, x_data);
|
||||
//hypre_SyncCudaComputeStream(hypre_handle());
|
||||
//hypre_SyncComputeStream(hypre_handle());
|
||||
#else /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
|
||||
HYPRE_Int i;
|
||||
#if defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
|
||||
@ -349,7 +349,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha,
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
hypre_SetSyncCudaCompute(sync_stream);
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
#ifdef HYPRE_PROFILE
|
||||
@ -725,7 +725,7 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha,
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
hypre_SetSyncCudaCompute(sync_stream);
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
#ifdef HYPRE_PROFILE
|
||||
|
||||
@ -497,7 +497,7 @@ hypre_ParCSRTMatMatKTDevice( hypre_ParCSRMatrix *A,
|
||||
|
||||
hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C)));
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return C;
|
||||
}
|
||||
@ -817,7 +817,7 @@ hypre_ParCSRMatrixRAPKTDevice( hypre_ParCSRMatrix *R,
|
||||
|
||||
hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C)));
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
@ -1269,7 +1269,7 @@ hypre_CSRMatrixTranspose(hypre_CSRMatrix *A,
|
||||
{
|
||||
HYPRE_Int ierr = 0;
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_CSRMatrixMemoryLocation(A) );
|
||||
|
||||
if (exec == HYPRE_EXEC_DEVICE)
|
||||
|
||||
@ -15,7 +15,7 @@
|
||||
#include "_hypre_utilities.hpp"
|
||||
#include "seq_mv.hpp"
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
hypre_CsrsvData*
|
||||
hypre_CsrsvDataCreate()
|
||||
{
|
||||
@ -82,9 +82,33 @@ hypre_GpuMatDataCreate()
|
||||
hypre_GpuMatDataMatInfo(data) = info;
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
oneapi::mkl::sparse::matrix_handle_t mat_handle;
|
||||
HYPRE_SYCL_CALL( oneapi::mkl::sparse::init_matrix_handle(&mat_handle) );
|
||||
hypre_GpuMatDataMatHandle(data) = mat_handle;
|
||||
#endif
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
void
|
||||
hypre_GPUMatDataSetCSRData( hypre_GpuMatData *data,
|
||||
hypre_CSRMatrix *matrix)
|
||||
{
|
||||
|
||||
#if defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
oneapi::mkl::sparse::matrix_handle_t mat_handle = hypre_GpuMatDataMatHandle(data);
|
||||
HYPRE_SYCL_CALL( oneapi::mkl::sparse::set_csr_data(mat_handle,
|
||||
hypre_CSRMatrixNumRows(matrix),
|
||||
hypre_CSRMatrixNumCols(matrix),
|
||||
oneapi::mkl::index_base::zero,
|
||||
hypre_CSRMatrixI(matrix),
|
||||
hypre_CSRMatrixJ(matrix),
|
||||
hypre_CSRMatrixData(matrix)) );
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
hypre_GpuMatDataDestroy(hypre_GpuMatData *data)
|
||||
{
|
||||
@ -103,6 +127,10 @@ hypre_GpuMatDataDestroy(hypre_GpuMatData *data)
|
||||
HYPRE_ROCSPARSE_CALL( rocsparse_destroy_mat_info(hypre_GpuMatDataMatInfo(data)) );
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
HYPRE_SYCL_CALL( oneapi::mkl::sparse::release_matrix_handle(&hypre_GpuMatDataMatHandle(data)) );
|
||||
#endif
|
||||
|
||||
hypre_TFree(data, HYPRE_MEMORY_HOST);
|
||||
}
|
||||
|
||||
@ -151,7 +179,7 @@ hypre_CSRMatrixAddDevice ( HYPRE_Complex alpha,
|
||||
hypre_CSRMatrixData(C) = C_data;
|
||||
hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE;
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return C;
|
||||
}
|
||||
@ -174,7 +202,7 @@ hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A,
|
||||
|
||||
hypreDevice_CSRSpGemm(A, B, &C);
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return C;
|
||||
}
|
||||
@ -326,7 +354,7 @@ hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext,
|
||||
*B_ext_diag_ptr = B_ext_diag;
|
||||
*B_ext_offd_ptr = B_ext_offd;
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return ierr;
|
||||
}
|
||||
@ -576,7 +604,7 @@ hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A,
|
||||
hypre_CSRMatrixData(C) = C_data;
|
||||
hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE;
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return C;
|
||||
}
|
||||
@ -618,7 +646,7 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A,
|
||||
hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE);
|
||||
hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -681,7 +709,7 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A )
|
||||
HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim,
|
||||
nrows, A_i, A_j, A_data);
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -725,7 +753,7 @@ hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A )
|
||||
|
||||
hypre_TFree(result, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return ierr;
|
||||
}
|
||||
@ -822,7 +850,7 @@ hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A,
|
||||
hypre_TFree(result, HYPRE_MEMORY_DEVICE);
|
||||
#endif
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return ierr;
|
||||
}
|
||||
@ -917,7 +945,7 @@ hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A,
|
||||
hypre_TFree(result, HYPRE_MEMORY_DEVICE);
|
||||
#endif
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return ierr;
|
||||
}
|
||||
@ -1113,7 +1141,7 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A,
|
||||
row_sum, scal, set_or_add[0] == 's' );
|
||||
}
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
}
|
||||
|
||||
/* type 0: diag
|
||||
@ -1206,7 +1234,7 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A,
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type );
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
}
|
||||
|
||||
/* return C = [A; B] */
|
||||
@ -1530,9 +1558,6 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A,
|
||||
#elif defined(HYPRE_USING_ROCSPARSE)
|
||||
hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data,
|
||||
data);
|
||||
#elif defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data,
|
||||
data);
|
||||
#else
|
||||
hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data);
|
||||
#endif
|
||||
@ -1546,7 +1571,7 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A,
|
||||
|
||||
*AT_ptr = C;
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
@ -44,7 +44,7 @@ hypre_CSRMatrixCreate( HYPRE_Int num_rows,
|
||||
/* set defaults */
|
||||
hypre_CSRMatrixOwnsData(matrix) = 1;
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
hypre_CSRMatrixSortedJ(matrix) = NULL;
|
||||
hypre_CSRMatrixSortedData(matrix) = NULL;
|
||||
hypre_CSRMatrixCsrsvData(matrix) = NULL;
|
||||
@ -78,7 +78,7 @@ hypre_CSRMatrixDestroy( hypre_CSRMatrix *matrix )
|
||||
hypre_TFree(hypre_CSRMatrixBigJ(matrix), memory_location);
|
||||
}
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
hypre_TFree(hypre_CSRMatrixSortedData(matrix), memory_location);
|
||||
hypre_TFree(hypre_CSRMatrixSortedJ(matrix), memory_location);
|
||||
hypre_CsrsvDataDestroy(hypre_CSRMatrixCsrsvData(matrix));
|
||||
@ -1074,7 +1074,7 @@ hypre_CSRMatrixPrefetch( hypre_CSRMatrix *A, HYPRE_MemoryLocation memory_locatio
|
||||
return ierr;
|
||||
}
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
hypre_GpuMatData *
|
||||
hypre_CSRMatrixGetGPUMatData(hypre_CSRMatrix *matrix)
|
||||
{
|
||||
@ -1086,6 +1086,7 @@ hypre_CSRMatrixGetGPUMatData(hypre_CSRMatrix *matrix)
|
||||
if (!hypre_CSRMatrixGPUMatData(matrix))
|
||||
{
|
||||
hypre_CSRMatrixGPUMatData(matrix) = hypre_GpuMatDataCreate();
|
||||
hypre_GPUMatDataSetCSRData(hypre_CSRMatrixGPUMatData(matrix), matrix);
|
||||
}
|
||||
|
||||
return hypre_CSRMatrixGPUMatData(matrix);
|
||||
|
||||
@ -16,12 +16,12 @@
|
||||
#ifndef hypre_CSR_MATRIX_HEADER
|
||||
#define hypre_CSR_MATRIX_HEADER
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
struct hypre_CsrsvData;
|
||||
typedef struct hypre_CsrsvData hypre_CsrsvData;
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
struct hypre_GpuMatData;
|
||||
typedef struct hypre_GpuMatData hypre_GpuMatData;
|
||||
#endif
|
||||
@ -45,7 +45,7 @@ typedef struct
|
||||
HYPRE_Int *rownnz; /* for compressing rows in matrix multiplication */
|
||||
HYPRE_Int num_rownnz;
|
||||
HYPRE_MemoryLocation memory_location; /* memory location of arrays i, j, data */
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
HYPRE_Int *sorted_j; /* some cusparse routines require sorted CSR */
|
||||
HYPRE_Complex *sorted_data;
|
||||
hypre_CsrsvData *csrsv_data;
|
||||
@ -69,7 +69,7 @@ typedef struct
|
||||
#define hypre_CSRMatrixOwnsData(matrix) ((matrix) -> owns_data)
|
||||
#define hypre_CSRMatrixMemoryLocation(matrix) ((matrix) -> memory_location)
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
#define hypre_CSRMatrixSortedJ(matrix) ((matrix) -> sorted_j)
|
||||
#define hypre_CSRMatrixSortedData(matrix) ((matrix) -> sorted_data)
|
||||
#define hypre_CSRMatrixCsrsvData(matrix) ((matrix) -> csrsv_data)
|
||||
|
||||
@ -712,10 +712,6 @@ hypre_CSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha,
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_CSRMatrixMemoryLocation(A) );
|
||||
/* WM: TODO - remove after sycl implementation in place */
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
exec = HYPRE_EXEC_HOST;
|
||||
#endif
|
||||
if (exec == HYPRE_EXEC_DEVICE)
|
||||
{
|
||||
ierr = hypre_CSRMatrixMatvecDevice(0, alpha, A, x, beta, b, y, offset);
|
||||
|
||||
@ -13,7 +13,6 @@
|
||||
|
||||
#include "seq_mv.h"
|
||||
#include "_hypre_utilities.hpp"
|
||||
#include "seq_mv.hpp"
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
|
||||
@ -53,8 +52,6 @@ hypre_CSRMatrixMatvecDevice2( HYPRE_Int trans,
|
||||
hypre_CSRMatrixMatvecRocsparse(trans, alpha, A, x, beta, y, offset);
|
||||
#elif defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
hypre_CSRMatrixMatvecOnemklsparse(trans, alpha, A, x, beta, y, offset);
|
||||
// WM: TODO: remove trivial HYPRE_USING_SYCL branch after onemlksparse implementation is in
|
||||
#elif defined(HYPRE_USING_SYCL)
|
||||
#else // #ifdef HYPRE_USING_CUSPARSE
|
||||
#error HYPRE SPMV TODO
|
||||
#endif
|
||||
@ -117,7 +114,7 @@ hypre_CSRMatrixMatvecDevice( HYPRE_Int trans,
|
||||
hypre_CSRMatrixMatvecDevice2(trans, alpha, A, x, beta, y, offset);
|
||||
}
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
hypre_GpuProfilingPopRange();
|
||||
@ -201,7 +198,7 @@ hypre_CSRMatrixMatvecCusparseNewAPI( HYPRE_Int trans,
|
||||
#endif
|
||||
dBuffer) );
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
if (trans)
|
||||
{
|
||||
@ -328,7 +325,29 @@ hypre_CSRMatrixMatvecOnemklsparse( HYPRE_Int trans,
|
||||
hypre_Vector *y,
|
||||
HYPRE_Int offset )
|
||||
{
|
||||
/* WM: TODO */
|
||||
sycl::queue *compute_queue = hypre_HandleComputeStream(hypre_handle());
|
||||
hypre_CSRMatrix *AT;
|
||||
oneapi::mkl::sparse::matrix_handle_t matA_handle = hypre_CSRMatrixGPUMatHandle(A);
|
||||
|
||||
if (trans)
|
||||
{
|
||||
hypre_CSRMatrixTransposeDevice(A, &AT, 1);
|
||||
matA_handle = hypre_CSRMatrixGPUMatHandle(AT);
|
||||
}
|
||||
|
||||
HYPRE_SYCL_CALL( oneapi::mkl::sparse::gemv(*compute_queue,
|
||||
oneapi::mkl::transpose::nontrans,
|
||||
alpha,
|
||||
matA_handle,
|
||||
hypre_VectorData(x),
|
||||
beta,
|
||||
hypre_VectorData(y) + offset).wait() );
|
||||
|
||||
if (trans)
|
||||
{
|
||||
hypre_CSRMatrixDestroy(AT);
|
||||
}
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
#endif // #if defined(HYPRE_USING_ROCSPARSE)
|
||||
|
||||
@ -89,7 +89,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A,
|
||||
#endif
|
||||
hypreDevice_CSRSpGemmRownnz(m, k, n, d_ia, d_ja, d_ib, d_jb, 0 /* without input rc */, d_rc);
|
||||
#ifdef HYPRE_SPGEMM_TIMING
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
t2 = hypre_MPI_Wtime() - t1;
|
||||
hypre_printf("Rownnz time %f\n", t2);
|
||||
#endif
|
||||
@ -101,7 +101,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A,
|
||||
1 /* exact row nnz */,
|
||||
&d_ic, &d_jc, &d_c, &nnzC);
|
||||
#ifdef HYPRE_SPGEMM_TIMING
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
t2 = hypre_MPI_Wtime() - t1;
|
||||
hypre_printf("SpGemmNumerical time %f\n", t2);
|
||||
#endif
|
||||
@ -115,7 +115,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A,
|
||||
#endif
|
||||
hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc);
|
||||
#ifdef HYPRE_SPGEMM_TIMING
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
t2 = hypre_MPI_Wtime() - t1;
|
||||
hypre_printf("RownnzEst time %f\n", t2);
|
||||
#endif
|
||||
@ -126,7 +126,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A,
|
||||
hypreDevice_CSRSpGemmNumerWithRownnzEstimate(m, k, n, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_rc,
|
||||
&d_ic, &d_jc, &d_c, &nnzC);
|
||||
#ifdef HYPRE_SPGEMM_TIMING
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
t2 = hypre_MPI_Wtime() - t1;
|
||||
hypre_printf("SpGemmNumerical time %f\n", t2);
|
||||
#endif
|
||||
@ -140,7 +140,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A,
|
||||
#endif
|
||||
hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc);
|
||||
#ifdef HYPRE_SPGEMM_TIMING
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
t2 = hypre_MPI_Wtime() - t1;
|
||||
hypre_printf("RownnzEst time %f\n", t2);
|
||||
#endif
|
||||
@ -157,7 +157,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A,
|
||||
d_rc + 2 * m,
|
||||
thrust::identity<HYPRE_Int>() );
|
||||
#ifdef HYPRE_SPGEMM_TIMING
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
t2 = hypre_MPI_Wtime() - t1;
|
||||
hypre_printf("RownnzBound time %f\n", t2);
|
||||
#endif
|
||||
@ -169,7 +169,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A,
|
||||
rownnz_exact,
|
||||
&d_ic, &d_jc, &d_c, &nnzC);
|
||||
#ifdef HYPRE_SPGEMM_TIMING
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
t2 = hypre_MPI_Wtime() - t1;
|
||||
hypre_printf("SpGemmNumerical time %f\n", t2);
|
||||
#endif
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
* SPDX-License-Identifier: (Apache-2.0 OR MIT)
|
||||
******************************************************************************/
|
||||
|
||||
#include "_hypre_onedpl.hpp"
|
||||
#include "seq_mv.h"
|
||||
#include "_hypre_utilities.hpp"
|
||||
|
||||
@ -146,19 +147,6 @@ hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPR
|
||||
|
||||
#endif // #if defined(HYPRE_USING_ROCSPARSE)
|
||||
|
||||
#if defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
HYPRE_Int
|
||||
hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA,
|
||||
HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa,
|
||||
HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out,
|
||||
HYPRE_Int want_data)
|
||||
{
|
||||
/* WM: TODO */
|
||||
return hypre_error_flag;
|
||||
}
|
||||
#endif // #if defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
HYPRE_Int
|
||||
@ -242,7 +230,79 @@ hypreDevice_CSRSpTrans(HYPRE_Int m, HYPRE_Int n, HYPRE_Int
|
||||
HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out,
|
||||
HYPRE_Int want_data)
|
||||
{
|
||||
/* WM: TODO */
|
||||
#ifdef HYPRE_PROFILE
|
||||
hypre_profile_times[HYPRE_TIMER_ID_SPTRANS] -= hypre_MPI_Wtime();
|
||||
#endif
|
||||
|
||||
HYPRE_Int *d_jt, *d_it, *d_pm, *d_ic, *d_jc;
|
||||
HYPRE_Complex *d_ac = NULL;
|
||||
HYPRE_Int *mem_work = hypre_TAlloc(HYPRE_Int, 3 * nnzA, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
/* allocate C */
|
||||
d_jc = hypre_TAlloc(HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE);
|
||||
if (want_data)
|
||||
{
|
||||
d_ac = hypre_TAlloc(HYPRE_Complex, nnzA, HYPRE_MEMORY_DEVICE);
|
||||
}
|
||||
|
||||
/* permutation vector */
|
||||
d_pm = mem_work;
|
||||
|
||||
/* expansion: A's row idx */
|
||||
d_it = d_pm + nnzA;
|
||||
hypreDevice_CsrRowPtrsToIndices_v2(m, nnzA, d_ia, d_it);
|
||||
|
||||
/* a copy of col idx of A */
|
||||
d_jt = d_it + nnzA;
|
||||
hypre_TMemcpy(d_jt, d_ja, HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
/* sort: by col */
|
||||
oneapi::dpl::counting_iterator<HYPRE_Int> count(0);
|
||||
HYPRE_ONEDPL_CALL( std::copy,
|
||||
count,
|
||||
count + nnzA,
|
||||
d_pm);
|
||||
|
||||
auto zip_jt_pm = oneapi::dpl::make_zip_iterator(d_jt, d_pm);
|
||||
HYPRE_ONEDPL_CALL( std::stable_sort,
|
||||
zip_jt_pm,
|
||||
zip_jt_pm + nnzA,
|
||||
[](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); } );
|
||||
|
||||
auto permuted_it = oneapi::dpl::make_permutation_iterator(d_it, d_pm);
|
||||
HYPRE_ONEDPL_CALL( std::copy,
|
||||
permuted_it,
|
||||
permuted_it + nnzA,
|
||||
d_jc );
|
||||
|
||||
if (want_data)
|
||||
{
|
||||
auto permuted_aa = oneapi::dpl::make_permutation_iterator(d_aa, d_pm);
|
||||
HYPRE_ONEDPL_CALL( std::copy,
|
||||
permuted_aa,
|
||||
permuted_aa + nnzA,
|
||||
d_ac );
|
||||
}
|
||||
|
||||
/* convert into ic: row idx --> row ptrs */
|
||||
d_ic = hypreDevice_CsrRowIndicesToPtrs(n, nnzA, d_jt);
|
||||
|
||||
#ifdef HYPRE_DEBUG
|
||||
HYPRE_Int nnzC;
|
||||
hypre_TMemcpy(&nnzC, &d_ic[n], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
|
||||
hypre_assert(nnzC == nnzA);
|
||||
#endif
|
||||
|
||||
hypre_TFree(mem_work, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
*d_ic_out = d_ic;
|
||||
*d_jc_out = d_jc;
|
||||
*d_ac_out = d_ac;
|
||||
|
||||
#ifdef HYPRE_PROFILE
|
||||
hypre_profile_times[HYPRE_TIMER_ID_SPTRANS] += hypre_MPI_Wtime();
|
||||
#endif
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
#endif // #if defined(HYPRE_USING_SYCL)
|
||||
|
||||
@ -315,14 +315,16 @@ HYPRE_Int hypre_SeqVectorElmdivpyMarked( hypre_Vector *x, hypre_Vector *b, hypre
|
||||
HYPRE_Int hypre_CSRMatrixSpMVDevice( HYPRE_Complex alpha, hypre_CSRMatrix *A, hypre_Vector *x,
|
||||
HYPRE_Complex beta, hypre_Vector *y, HYPRE_Int fill );
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
hypre_CsrsvData* hypre_CsrsvDataCreate();
|
||||
void hypre_CsrsvDataDestroy(hypre_CsrsvData *data);
|
||||
hypre_GpuMatData* hypre_GpuMatDataCreate();
|
||||
void hypre_GPUMatDataSetCSRData(hypre_GpuMatData *data, hypre_CSRMatrix *matrix);
|
||||
void hypre_GpuMatDataDestroy(hypre_GpuMatData *data);
|
||||
hypre_GpuMatData* hypre_CSRMatrixGetGPUMatData(hypre_CSRMatrix *matrix);
|
||||
#define hypre_CSRMatrixGPUMatDescr(matrix) ( hypre_GpuMatDataMatDecsr(hypre_CSRMatrixGetGPUMatData(matrix)) )
|
||||
#define hypre_CSRMatrixGPUMatInfo(matrix) ( hypre_GpuMatDataMatInfo (hypre_CSRMatrixGetGPUMatData(matrix)) )
|
||||
#define hypre_CSRMatrixGPUMatHandle(matrix) ( hypre_GpuMatDataMatHandle (hypre_CSRMatrixGetGPUMatData(matrix)) )
|
||||
#define hypre_CSRMatrixGPUMatSpMVBuffer(matrix) ( hypre_GpuMatDataSpMVBuffer (hypre_CSRMatrixGetGPUMatData(matrix)) )
|
||||
#endif
|
||||
void hypre_CSRMatrixGpuSpMVAnalysis(hypre_CSRMatrix *matrix);
|
||||
|
||||
@ -36,12 +36,12 @@ extern "C" {
|
||||
#ifndef hypre_CSR_MATRIX_HEADER
|
||||
#define hypre_CSR_MATRIX_HEADER
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
struct hypre_CsrsvData;
|
||||
typedef struct hypre_CsrsvData hypre_CsrsvData;
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
struct hypre_GpuMatData;
|
||||
typedef struct hypre_GpuMatData hypre_GpuMatData;
|
||||
#endif
|
||||
@ -65,7 +65,7 @@ typedef struct
|
||||
HYPRE_Int *rownnz; /* for compressing rows in matrix multiplication */
|
||||
HYPRE_Int num_rownnz;
|
||||
HYPRE_MemoryLocation memory_location; /* memory location of arrays i, j, data */
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
HYPRE_Int *sorted_j; /* some cusparse routines require sorted CSR */
|
||||
HYPRE_Complex *sorted_data;
|
||||
hypre_CsrsvData *csrsv_data;
|
||||
@ -89,7 +89,7 @@ typedef struct
|
||||
#define hypre_CSRMatrixOwnsData(matrix) ((matrix) -> owns_data)
|
||||
#define hypre_CSRMatrixMemoryLocation(matrix) ((matrix) -> memory_location)
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
#define hypre_CSRMatrixSortedJ(matrix) ((matrix) -> sorted_j)
|
||||
#define hypre_CSRMatrixSortedData(matrix) ((matrix) -> sorted_data)
|
||||
#define hypre_CSRMatrixCsrsvData(matrix) ((matrix) -> csrsv_data)
|
||||
@ -587,14 +587,16 @@ HYPRE_Int hypre_SeqVectorElmdivpyMarked( hypre_Vector *x, hypre_Vector *b, hypre
|
||||
HYPRE_Int hypre_CSRMatrixSpMVDevice( HYPRE_Complex alpha, hypre_CSRMatrix *A, hypre_Vector *x,
|
||||
HYPRE_Complex beta, hypre_Vector *y, HYPRE_Int fill );
|
||||
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
|
||||
#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
hypre_CsrsvData* hypre_CsrsvDataCreate();
|
||||
void hypre_CsrsvDataDestroy(hypre_CsrsvData *data);
|
||||
hypre_GpuMatData* hypre_GpuMatDataCreate();
|
||||
void hypre_GPUMatDataSetCSRData(hypre_GpuMatData *data, hypre_CSRMatrix *matrix);
|
||||
void hypre_GpuMatDataDestroy(hypre_GpuMatData *data);
|
||||
hypre_GpuMatData* hypre_CSRMatrixGetGPUMatData(hypre_CSRMatrix *matrix);
|
||||
#define hypre_CSRMatrixGPUMatDescr(matrix) ( hypre_GpuMatDataMatDecsr(hypre_CSRMatrixGetGPUMatData(matrix)) )
|
||||
#define hypre_CSRMatrixGPUMatInfo(matrix) ( hypre_GpuMatDataMatInfo (hypre_CSRMatrixGetGPUMatData(matrix)) )
|
||||
#define hypre_CSRMatrixGPUMatHandle(matrix) ( hypre_GpuMatDataMatHandle (hypre_CSRMatrixGetGPUMatData(matrix)) )
|
||||
#define hypre_CSRMatrixGPUMatSpMVBuffer(matrix) ( hypre_GpuMatDataSpMVBuffer (hypre_CSRMatrixGetGPUMatData(matrix)) )
|
||||
#endif
|
||||
void hypre_CSRMatrixGpuSpMVAnalysis(hypre_CSRMatrix *matrix);
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include "_hypre_onedpl.hpp"
|
||||
#include "seq_mv.h"
|
||||
#include "_hypre_utilities.hpp" //RL: TODO vector_device.c, include cuda there
|
||||
|
||||
@ -286,6 +287,11 @@ hypre_SeqVectorSetConstantValues( hypre_Vector *v,
|
||||
{
|
||||
HYPRE_THRUST_CALL( fill_n, vector_data, size, value );
|
||||
}
|
||||
#elif defined(HYPRE_USING_SYCL)
|
||||
if (size > 0)
|
||||
{
|
||||
HYPRE_ONEDPL_CALL( std::fill_n, vector_data, size, value );
|
||||
}
|
||||
#else
|
||||
HYPRE_Int i;
|
||||
#if defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
@ -300,7 +306,7 @@ hypre_SeqVectorSetConstantValues( hypre_Vector *v,
|
||||
#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
#ifdef HYPRE_PROFILE
|
||||
@ -467,13 +473,31 @@ hypre_SeqVectorScale( HYPRE_Complex alpha,
|
||||
|
||||
//hypre_SeqVectorPrefetch(y, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
#if defined(HYPRE_USING_CUBLAS)
|
||||
HYPRE_CUBLAS_CALL( cublasDscal(hypre_HandleCublasHandle(hypre_handle()), size, &alpha, y_data, 1) );
|
||||
#else
|
||||
HYPRE_THRUST_CALL( transform, y_data, y_data + size, y_data, alpha * _1 );
|
||||
#endif
|
||||
#endif // #if defined(HYPRE_USING_CUBLAS)
|
||||
|
||||
#elif defined(HYPRE_USING_SYCL) // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
#if defined(HYPRE_USING_ONEMKLBLAS)
|
||||
HYPRE_SYCL_CALL( oneapi::mkl::blas::scal(*hypre_HandleComputeStream(hypre_handle()),
|
||||
size, alpha,
|
||||
y_data, 1).wait() );
|
||||
#else
|
||||
HYPRE_ONEDPL_CALL( std::transform, y_data, y_data + size,
|
||||
y_data, [alpha](HYPRE_Complex y) -> HYPRE_Complex { return alpha * y; } );
|
||||
#endif // #if defined(HYPRE_USING_ONEMKL)
|
||||
|
||||
#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
#else // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
HYPRE_Int i;
|
||||
#if defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
#pragma omp target teams distribute parallel for private(i) is_device_ptr(y_data)
|
||||
@ -485,10 +509,10 @@ hypre_SeqVectorScale( HYPRE_Complex alpha,
|
||||
y_data[i] *= alpha;
|
||||
}
|
||||
|
||||
#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
|
||||
#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
#ifdef HYPRE_PROFILE
|
||||
@ -520,14 +544,32 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha,
|
||||
//hypre_SeqVectorPrefetch(x, HYPRE_MEMORY_DEVICE);
|
||||
//hypre_SeqVectorPrefetch(y, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
#if defined(HYPRE_USING_CUBLAS)
|
||||
HYPRE_CUBLAS_CALL( cublasDaxpy(hypre_HandleCublasHandle(hypre_handle()), size, &alpha, x_data, 1,
|
||||
y_data, 1) );
|
||||
#else
|
||||
HYPRE_THRUST_CALL( transform, x_data, x_data + size, y_data, y_data, alpha * _1 + _2 );
|
||||
#endif
|
||||
#endif // #if defined(HYPRE_USING_CUBLAS)
|
||||
|
||||
#elif defined(HYPRE_USING_SYCL) // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
#if defined(HYPRE_USING_ONEMKLBLAS)
|
||||
HYPRE_SYCL_CALL( oneapi::mkl::blas::axpy(*hypre_HandleComputeStream(hypre_handle()),
|
||||
size, alpha,
|
||||
x_data, 1, y_data, 1).wait() );
|
||||
#else
|
||||
HYPRE_ONEDPL_CALL( std::transform, x_data, x_data + size, y_data, y_data,
|
||||
[alpha](HYPRE_Complex x, HYPRE_Complex y) -> HYPRE_Complex { return alpha * x + y; } );
|
||||
#endif // #if defined(HYPRE_USING_ONEMKL)
|
||||
|
||||
#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
#else // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
HYPRE_Int i;
|
||||
#if defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
#pragma omp target teams distribute parallel for private(i) is_device_ptr(y_data, x_data)
|
||||
@ -539,10 +581,10 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha,
|
||||
y_data[i] += alpha * x_data[i];
|
||||
}
|
||||
|
||||
#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
|
||||
#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
#ifdef HYPRE_PROFILE
|
||||
@ -567,7 +609,7 @@ hypre_SeqVectorElmdivpy( hypre_Vector *x,
|
||||
HYPRE_Complex *y_data = hypre_VectorData(y);
|
||||
HYPRE_Int size = hypre_VectorSize(b);
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
//HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_VectorMemoryLocation(x), hypre_VectorMemoryLocation(b) );
|
||||
//RL: TODO back to hypre_GetExecPolicy2 later
|
||||
HYPRE_ExecutionPolicy exec = HYPRE_EXEC_DEVICE;
|
||||
@ -596,7 +638,7 @@ hypre_SeqVectorElmdivpy( hypre_Vector *x,
|
||||
}
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
#ifdef HYPRE_PROFILE
|
||||
@ -647,7 +689,7 @@ hypre_SeqVectorElmdivpyMarked( hypre_Vector *x,
|
||||
}
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
#ifdef HYPRE_PROFILE
|
||||
@ -678,19 +720,41 @@ hypre_SeqVectorInnerProd( hypre_Vector *x,
|
||||
//hypre_SeqVectorPrefetch(x, HYPRE_MEMORY_DEVICE);
|
||||
//hypre_SeqVectorPrefetch(y, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
#ifndef HYPRE_COMPLEX
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
#if defined(HYPRE_USING_CUBLAS)
|
||||
HYPRE_CUBLAS_CALL( cublasDdot(hypre_HandleCublasHandle(hypre_handle()), size, x_data, 1, y_data, 1,
|
||||
&result) );
|
||||
#else
|
||||
result = HYPRE_THRUST_CALL( inner_product, x_data, x_data + size, y_data, 0.0 );
|
||||
#endif
|
||||
#endif // #if defined(HYPRE_USING_CUBLAS)
|
||||
|
||||
#elif defined(HYPRE_USING_SYCL) // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
#if defined(HYPRE_USING_ONEMKLBLAS)
|
||||
HYPRE_Real *result_dev = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE);
|
||||
HYPRE_SYCL_CALL( oneapi::mkl::blas::dot(*hypre_HandleComputeStream(hypre_handle()),
|
||||
size, x_data, 1,
|
||||
y_data, 1, result_dev).wait() );
|
||||
hypre_TMemcpy(&result, result_dev, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
|
||||
hypre_TFree(result_dev, HYPRE_MEMORY_DEVICE);
|
||||
#else
|
||||
result = HYPRE_ONEDPL_CALL( std::transform_reduce, x_data, x_data + size, y_data, 0.0 );
|
||||
#endif // #if defined(HYPRE_USING_ONEMKLBLAS)
|
||||
|
||||
#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
#else // #ifndef HYPRE_COMPLEX
|
||||
/* TODO */
|
||||
#error "Complex inner product"
|
||||
#endif
|
||||
#else /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
|
||||
#endif // #ifndef HYPRE_COMPLEX
|
||||
|
||||
#else // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
HYPRE_Int i;
|
||||
#if defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
#pragma omp target teams distribute parallel for private(i) reduction(+:result) is_device_ptr(y_data,x_data) map(result)
|
||||
@ -701,10 +765,11 @@ hypre_SeqVectorInnerProd( hypre_Vector *x,
|
||||
{
|
||||
result += hypre_conj(y_data[i]) * x_data[i];
|
||||
}
|
||||
#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
|
||||
|
||||
#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
#ifdef HYPRE_PROFILE
|
||||
@ -806,7 +871,7 @@ hypre_SeqVectorMax( HYPRE_Complex alpha,
|
||||
|
||||
#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
|
||||
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
#ifdef HYPRE_PROFILE
|
||||
hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime();
|
||||
|
||||
@ -30,6 +30,7 @@ HEADERS =\
|
||||
mpistubs.h\
|
||||
threading.h\
|
||||
timing.h\
|
||||
_hypre_onedpl.hpp\
|
||||
_hypre_utilities.h\
|
||||
_hypre_utilities.hpp
|
||||
|
||||
@ -81,6 +82,7 @@ SONAME = libHYPRE_utilities-${HYPRE_RELEASE_VERSION}${HYPRE_LIB_SUFFIX}
|
||||
|
||||
all: libHYPRE_utilities${HYPRE_LIB_SUFFIX}
|
||||
cp -fR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
|
||||
cp -fR $(srcdir)/_hypre_onedpl.hpp $(HYPRE_BUILD_DIR)/include
|
||||
cp -fR $(srcdir)/_hypre_utilities.h $(HYPRE_BUILD_DIR)/include
|
||||
cp -fR $(srcdir)/_hypre_utilities.hpp $(HYPRE_BUILD_DIR)/include
|
||||
cp -fR $(srcdir)/fortran*.h $(HYPRE_BUILD_DIR)/include
|
||||
|
||||
21
src/utilities/_hypre_onedpl.hpp
Normal file
21
src/utilities/_hypre_onedpl.hpp
Normal file
@ -0,0 +1,21 @@
|
||||
/******************************************************************************
|
||||
* Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
|
||||
* HYPRE Project Developers. See the top-level COPYRIGHT file for details.
|
||||
*
|
||||
* SPDX-License-Identifier: (Apache-2.0 OR MIT)
|
||||
******************************************************************************/
|
||||
|
||||
#include "HYPRE_config.h"
|
||||
|
||||
#if defined(HYPRE_USING_SYCL)
|
||||
|
||||
/* oneAPI DPL headers */
|
||||
/* NOTE: these must be included before standard C++ headers */
|
||||
|
||||
#include <oneapi/dpl/execution>
|
||||
#include <oneapi/dpl/algorithm>
|
||||
#include <oneapi/dpl/numeric>
|
||||
#include <oneapi/dpl/iterator>
|
||||
#include <oneapi/dpl/functional>
|
||||
|
||||
#endif
|
||||
@ -1740,7 +1740,7 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR
|
||||
hypre_UnorderedBigIntMap *inverse_map);
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle);
|
||||
HYPRE_Int hypre_SyncComputeStream(hypre_Handle *hypre_handle);
|
||||
HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle);
|
||||
HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle);
|
||||
HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data,
|
||||
@ -1753,6 +1753,13 @@ HYPRE_Int hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex
|
||||
HYPRE_Int hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v);
|
||||
HYPRE_Int hypreDevice_Filln(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v);
|
||||
HYPRE_Int hypreDevice_Scalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v);
|
||||
HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr);
|
||||
HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr,
|
||||
HYPRE_Int *d_row_ind);
|
||||
HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind);
|
||||
HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind,
|
||||
HYPRE_Int *d_row_ptr);
|
||||
|
||||
#endif
|
||||
|
||||
HYPRE_Int hypre_CurandUniform( HYPRE_Int n, HYPRE_Real *urand, HYPRE_Int set_seed,
|
||||
@ -1775,7 +1782,7 @@ char *hypre_strcpy(char *destination, const char *source);
|
||||
HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action);
|
||||
HYPRE_Int hypre_RestoreSyncCudaCompute();
|
||||
HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr);
|
||||
HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle);
|
||||
HYPRE_Int hypre_SyncComputeStream(hypre_Handle *hypre_handle);
|
||||
|
||||
/* handle.c */
|
||||
HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse );
|
||||
|
||||
@ -104,6 +104,14 @@ struct hypre_device_allocator
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#if defined(HYPRE_USING_ROCSPARSE)
|
||||
#include <rocsparse.h>
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_ROCRAND)
|
||||
#include <rocrand.h>
|
||||
#endif
|
||||
|
||||
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
* sycl includes
|
||||
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
|
||||
@ -112,17 +120,18 @@ struct hypre_device_allocator
|
||||
|
||||
/* WM: problems with this being inside extern C++ {} */
|
||||
/* #include <CL/sycl.hpp> */
|
||||
#if defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
#include <oneapi/mkl/spblas.hpp>
|
||||
#endif
|
||||
#if defined(HYPRE_USING_ONEMKLBLAS)
|
||||
#include <oneapi/mkl/blas.hpp>
|
||||
#endif
|
||||
#if defined(HYPRE_USING_ONEMKLRAND)
|
||||
#include <oneapi/mkl/rng.hpp>
|
||||
#endif
|
||||
|
||||
#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
|
||||
#if defined(HYPRE_USING_ROCSPARSE)
|
||||
#include <rocsparse.h>
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_ROCRAND)
|
||||
#include <rocrand.h>
|
||||
#endif
|
||||
|
||||
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
* macros for wrapping cuda/hip/sycl calls for error reporting
|
||||
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
|
||||
@ -164,6 +173,29 @@ struct hypre_device_allocator
|
||||
assert(0); exit(1); \
|
||||
}
|
||||
|
||||
#define HYPRE_ONEDPL_CALL(func_name, ...) \
|
||||
func_name(oneapi::dpl::execution::make_device_policy( \
|
||||
*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);
|
||||
|
||||
#define HYPRE_SYCL_LAUNCH(kernel_name, gridsize, blocksize, ...) \
|
||||
{ \
|
||||
if ( gridsize[0] == 0 || blocksize[0] == 0 ) \
|
||||
{ \
|
||||
hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \
|
||||
__FILE__, __LINE__, \
|
||||
gridsize[0], blocksize[0]); \
|
||||
assert(0); exit(1); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { \
|
||||
cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \
|
||||
[=] (sycl::nd_item<1> item) { (kernel_name)(item, __VA_ARGS__); \
|
||||
}); \
|
||||
}).wait_and_throw(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
|
||||
#define HYPRE_CUBLAS_CALL(call) do { \
|
||||
@ -365,6 +397,10 @@ struct hypre_CsrsvData
|
||||
#elif defined(HYPRE_USING_ROCSPARSE)
|
||||
rocsparse_mat_info info_L;
|
||||
rocsparse_mat_info info_U;
|
||||
#elif defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
/* WM: todo - placeholders */
|
||||
char info_L;
|
||||
char info_U;
|
||||
#endif
|
||||
hypre_int BufferSize;
|
||||
char *Buffer;
|
||||
@ -386,10 +422,15 @@ struct hypre_GpuMatData
|
||||
rocsparse_mat_descr mat_descr;
|
||||
rocsparse_mat_info mat_info;
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
oneapi::mkl::sparse::matrix_handle_t mat_handle;
|
||||
#endif
|
||||
};
|
||||
|
||||
#define hypre_GpuMatDataMatDecsr(data) ((data) -> mat_descr)
|
||||
#define hypre_GpuMatDataMatInfo(data) ((data) -> mat_info)
|
||||
#define hypre_GpuMatDataMatHandle(data) ((data) -> mat_handle)
|
||||
#define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer)
|
||||
|
||||
#endif //#if defined(HYPRE_USING_GPU)
|
||||
@ -443,9 +484,9 @@ using namespace thrust::placeholders;
|
||||
|
||||
#if defined(HYPRE_DEBUG)
|
||||
#if defined(HYPRE_USING_CUDA)
|
||||
#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); }
|
||||
#define GPU_LAUNCH_SYNC { hypre_SyncComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); }
|
||||
#elif defined(HYPRE_USING_HIP)
|
||||
#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); }
|
||||
#define GPU_LAUNCH_SYNC { hypre_SyncComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); }
|
||||
#endif
|
||||
#else // #if defined(HYPRE_DEBUG)
|
||||
#define GPU_LAUNCH_SYNC
|
||||
@ -1027,16 +1068,6 @@ HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i);
|
||||
|
||||
HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i);
|
||||
|
||||
HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr);
|
||||
|
||||
HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr,
|
||||
HYPRE_Int *d_row_ind);
|
||||
|
||||
HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind);
|
||||
|
||||
HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind,
|
||||
HYPRE_Int *d_row_ptr);
|
||||
|
||||
HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y,
|
||||
char *work);
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
* SPDX-License-Identifier: (Apache-2.0 OR MIT)
|
||||
******************************************************************************/
|
||||
|
||||
#include "_hypre_onedpl.hpp"
|
||||
#include "_hypre_utilities.h"
|
||||
#include "_hypre_utilities.hpp"
|
||||
|
||||
@ -42,7 +43,156 @@ sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n,
|
||||
|
||||
return gDim;
|
||||
}
|
||||
#endif
|
||||
void
|
||||
hypreSYCLKernel_IVAXPY(sycl::nd_item<1>& item,
|
||||
HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y )
|
||||
{
|
||||
HYPRE_Int i = (HYPRE_Int) item.get_global_linear_id();
|
||||
|
||||
if (i < n)
|
||||
{
|
||||
y[i] += x[i] / a[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* Inverse Vector AXPY: y[i] = x[i] / a[i] + y[i] */
|
||||
HYPRE_Int
|
||||
hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y)
|
||||
{
|
||||
/* trivial case */
|
||||
if (n <= 0)
|
||||
{
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
|
||||
|
||||
HYPRE_SYCL_LAUNCH( hypreSYCLKernel_IVAXPY, gDim, bDim, n, a, x, y );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
void
|
||||
hypreSYCLKernel_IVAXPYMarked(sycl::nd_item<1>& item,
|
||||
HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y,
|
||||
HYPRE_Int *marker, HYPRE_Int marker_val)
|
||||
{
|
||||
HYPRE_Int i = (HYPRE_Int) item.get_global_linear_id();
|
||||
|
||||
if (i < n)
|
||||
{
|
||||
if (marker[i] == marker_val)
|
||||
{
|
||||
y[i] += x[i] / a[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Inverse Vector AXPY: y[i] = x[i] / a[i] + y[i] */
|
||||
HYPRE_Int
|
||||
hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y,
|
||||
HYPRE_Int *marker, HYPRE_Int marker_val)
|
||||
{
|
||||
/* trivial case */
|
||||
if (n <= 0)
|
||||
{
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
|
||||
|
||||
HYPRE_SYCL_LAUNCH( hypreSYCLKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
HYPRE_Int*
|
||||
hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr)
|
||||
{
|
||||
/* trivial case */
|
||||
if (nrows <= 0 || nnz <= 0)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind);
|
||||
|
||||
return d_row_ind;
|
||||
}
|
||||
|
||||
void
|
||||
hypreSYCLKernel_ScatterRowPtr(sycl::nd_item<1>& item,
|
||||
HYPRE_Int nrows, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind)
|
||||
{
|
||||
HYPRE_Int i = (HYPRE_Int) item.get_global_linear_id();
|
||||
|
||||
if (i < nrows)
|
||||
{
|
||||
HYPRE_Int row_start = d_row_ptr[i];
|
||||
HYPRE_Int row_end = d_row_ptr[i + 1];
|
||||
if (row_start != row_end)
|
||||
{
|
||||
d_row_ind[row_start] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
HYPRE_Int
|
||||
hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr,
|
||||
HYPRE_Int *d_row_ind)
|
||||
{
|
||||
/* trivial case */
|
||||
if (nrows <= 0 || nnz <= 0)
|
||||
{
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(nrows, "thread", bDim);
|
||||
|
||||
HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 );
|
||||
|
||||
HYPRE_SYCL_LAUNCH( hypreSYCLKernel_ScatterRowPtr, gDim, bDim, nrows, d_row_ptr, d_row_ind );
|
||||
|
||||
HYPRE_ONEDPL_CALL( std::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind,
|
||||
oneapi::dpl::maximum<HYPRE_Int>());
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
HYPRE_Int*
|
||||
hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind)
|
||||
{
|
||||
HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
oneapi::dpl::counting_iterator<HYPRE_Int> count(0);
|
||||
HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound,
|
||||
d_row_ind, d_row_ind + nnz,
|
||||
count,
|
||||
count + nrows + 1,
|
||||
d_row_ptr);
|
||||
|
||||
return d_row_ptr;
|
||||
}
|
||||
|
||||
HYPRE_Int
|
||||
hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind,
|
||||
HYPRE_Int *d_row_ptr)
|
||||
{
|
||||
oneapi::dpl::counting_iterator<HYPRE_Int> count(0);
|
||||
HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound,
|
||||
d_row_ind, d_row_ind + nnz,
|
||||
count,
|
||||
count + nrows + 1,
|
||||
d_row_ptr);
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
#endif // #if defined(HYPRE_USING_SYCL)
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
@ -1381,6 +1531,17 @@ hypre_SyncCudaDevice(hypre_Handle *hypre_handle)
|
||||
HYPRE_CUDA_CALL( cudaDeviceSynchronize() );
|
||||
#elif defined(HYPRE_USING_HIP)
|
||||
HYPRE_HIP_CALL( hipDeviceSynchronize() );
|
||||
#elif defined(HYPRE_USING_SYCL)
|
||||
try
|
||||
{
|
||||
HYPRE_SYCL_CALL( hypre_HandleComputeStream(hypre_handle)->wait_and_throw() );
|
||||
}
|
||||
catch (sycl::exception const &exc)
|
||||
{
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
#endif
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -1404,9 +1565,9 @@ hypre_ResetCudaDevice(hypre_Handle *hypre_handle)
|
||||
* 4: sync stream based on cuda_compute_stream_sync
|
||||
*/
|
||||
HYPRE_Int
|
||||
hypre_SyncCudaComputeStream_core(HYPRE_Int action,
|
||||
hypre_Handle *hypre_handle,
|
||||
HYPRE_Int *cuda_compute_stream_sync_ptr)
|
||||
hypre_SyncComputeStream_core(HYPRE_Int action,
|
||||
hypre_Handle *hypre_handle,
|
||||
HYPRE_Int *cuda_compute_stream_sync_ptr)
|
||||
{
|
||||
/* with UVM the default is to sync at kernel completions, since host is also able to
|
||||
* touch GPU memory */
|
||||
@ -1448,7 +1609,7 @@ hypre_SyncCudaComputeStream_core(HYPRE_Int action,
|
||||
#endif
|
||||
break;
|
||||
default:
|
||||
hypre_printf("hypre_SyncCudaComputeStream_core invalid action\n");
|
||||
hypre_printf("hypre_SyncComputeStream_core invalid action\n");
|
||||
hypre_error_in_arg(1);
|
||||
}
|
||||
|
||||
@ -1460,7 +1621,7 @@ hypre_SetSyncCudaCompute(HYPRE_Int action)
|
||||
{
|
||||
/* convert to 1/0 */
|
||||
action = action != 0;
|
||||
hypre_SyncCudaComputeStream_core(action, NULL, NULL);
|
||||
hypre_SyncComputeStream_core(action, NULL, NULL);
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -1468,7 +1629,7 @@ hypre_SetSyncCudaCompute(HYPRE_Int action)
|
||||
HYPRE_Int
|
||||
hypre_RestoreSyncCudaCompute()
|
||||
{
|
||||
hypre_SyncCudaComputeStream_core(2, NULL, NULL);
|
||||
hypre_SyncComputeStream_core(2, NULL, NULL);
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -1476,15 +1637,15 @@ hypre_RestoreSyncCudaCompute()
|
||||
HYPRE_Int
|
||||
hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr)
|
||||
{
|
||||
hypre_SyncCudaComputeStream_core(3, NULL, cuda_compute_stream_sync_ptr);
|
||||
hypre_SyncComputeStream_core(3, NULL, cuda_compute_stream_sync_ptr);
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
HYPRE_Int
|
||||
hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle)
|
||||
hypre_SyncComputeStream(hypre_Handle *hypre_handle)
|
||||
{
|
||||
hypre_SyncCudaComputeStream_core(4, hypre_handle, NULL);
|
||||
hypre_SyncComputeStream_core(4, hypre_handle, NULL);
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
@ -47,6 +47,14 @@
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#if defined(HYPRE_USING_ROCSPARSE)
|
||||
#include <rocsparse.h>
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_ROCRAND)
|
||||
#include <rocrand.h>
|
||||
#endif
|
||||
|
||||
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
* sycl includes
|
||||
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
|
||||
@ -55,17 +63,18 @@
|
||||
|
||||
/* WM: problems with this being inside extern C++ {} */
|
||||
/* #include <CL/sycl.hpp> */
|
||||
#if defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
#include <oneapi/mkl/spblas.hpp>
|
||||
#endif
|
||||
#if defined(HYPRE_USING_ONEMKLBLAS)
|
||||
#include <oneapi/mkl/blas.hpp>
|
||||
#endif
|
||||
#if defined(HYPRE_USING_ONEMKLRAND)
|
||||
#include <oneapi/mkl/rng.hpp>
|
||||
#endif
|
||||
|
||||
#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
|
||||
#if defined(HYPRE_USING_ROCSPARSE)
|
||||
#include <rocsparse.h>
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_ROCRAND)
|
||||
#include <rocrand.h>
|
||||
#endif
|
||||
|
||||
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
* macros for wrapping cuda/hip/sycl calls for error reporting
|
||||
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
|
||||
@ -107,6 +116,29 @@
|
||||
assert(0); exit(1); \
|
||||
}
|
||||
|
||||
#define HYPRE_ONEDPL_CALL(func_name, ...) \
|
||||
func_name(oneapi::dpl::execution::make_device_policy( \
|
||||
*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);
|
||||
|
||||
#define HYPRE_SYCL_LAUNCH(kernel_name, gridsize, blocksize, ...) \
|
||||
{ \
|
||||
if ( gridsize[0] == 0 || blocksize[0] == 0 ) \
|
||||
{ \
|
||||
hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \
|
||||
__FILE__, __LINE__, \
|
||||
gridsize[0], blocksize[0]); \
|
||||
assert(0); exit(1); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { \
|
||||
cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \
|
||||
[=] (sycl::nd_item<1> item) { (kernel_name)(item, __VA_ARGS__); \
|
||||
}); \
|
||||
}).wait_and_throw(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
|
||||
#define HYPRE_CUBLAS_CALL(call) do { \
|
||||
@ -308,6 +340,10 @@ struct hypre_CsrsvData
|
||||
#elif defined(HYPRE_USING_ROCSPARSE)
|
||||
rocsparse_mat_info info_L;
|
||||
rocsparse_mat_info info_U;
|
||||
#elif defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
/* WM: todo - placeholders */
|
||||
char info_L;
|
||||
char info_U;
|
||||
#endif
|
||||
hypre_int BufferSize;
|
||||
char *Buffer;
|
||||
@ -329,10 +365,15 @@ struct hypre_GpuMatData
|
||||
rocsparse_mat_descr mat_descr;
|
||||
rocsparse_mat_info mat_info;
|
||||
#endif
|
||||
|
||||
#if defined(HYPRE_USING_ONEMKLSPARSE)
|
||||
oneapi::mkl::sparse::matrix_handle_t mat_handle;
|
||||
#endif
|
||||
};
|
||||
|
||||
#define hypre_GpuMatDataMatDecsr(data) ((data) -> mat_descr)
|
||||
#define hypre_GpuMatDataMatInfo(data) ((data) -> mat_info)
|
||||
#define hypre_GpuMatDataMatHandle(data) ((data) -> mat_handle)
|
||||
#define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer)
|
||||
|
||||
#endif //#if defined(HYPRE_USING_GPU)
|
||||
@ -386,9 +427,9 @@ using namespace thrust::placeholders;
|
||||
|
||||
#if defined(HYPRE_DEBUG)
|
||||
#if defined(HYPRE_USING_CUDA)
|
||||
#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); }
|
||||
#define GPU_LAUNCH_SYNC { hypre_SyncComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); }
|
||||
#elif defined(HYPRE_USING_HIP)
|
||||
#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); }
|
||||
#define GPU_LAUNCH_SYNC { hypre_SyncComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); }
|
||||
#endif
|
||||
#else // #if defined(HYPRE_DEBUG)
|
||||
#define GPU_LAUNCH_SYNC
|
||||
@ -970,16 +1011,6 @@ HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i);
|
||||
|
||||
HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i);
|
||||
|
||||
HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr);
|
||||
|
||||
HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr,
|
||||
HYPRE_Int *d_row_ind);
|
||||
|
||||
HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind);
|
||||
|
||||
HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind,
|
||||
HYPRE_Int *d_row_ptr);
|
||||
|
||||
HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y,
|
||||
char *work);
|
||||
|
||||
|
||||
@ -168,7 +168,7 @@ hypre_IntArraySetConstantValues( hypre_IntArray *v,
|
||||
#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
hypre_SyncCudaComputeStream(hypre_handle());
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
#endif
|
||||
|
||||
return ierr;
|
||||
|
||||
@ -269,7 +269,7 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR
|
||||
hypre_UnorderedBigIntMap *inverse_map);
|
||||
|
||||
#if defined(HYPRE_USING_GPU)
|
||||
HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle);
|
||||
HYPRE_Int hypre_SyncComputeStream(hypre_Handle *hypre_handle);
|
||||
HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle);
|
||||
HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle);
|
||||
HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data,
|
||||
@ -282,6 +282,13 @@ HYPRE_Int hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex
|
||||
HYPRE_Int hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v);
|
||||
HYPRE_Int hypreDevice_Filln(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v);
|
||||
HYPRE_Int hypreDevice_Scalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v);
|
||||
HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr);
|
||||
HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr,
|
||||
HYPRE_Int *d_row_ind);
|
||||
HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind);
|
||||
HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind,
|
||||
HYPRE_Int *d_row_ptr);
|
||||
|
||||
#endif
|
||||
|
||||
HYPRE_Int hypre_CurandUniform( HYPRE_Int n, HYPRE_Real *urand, HYPRE_Int set_seed,
|
||||
@ -304,7 +311,7 @@ char *hypre_strcpy(char *destination, const char *source);
|
||||
HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action);
|
||||
HYPRE_Int hypre_RestoreSyncCudaCompute();
|
||||
HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr);
|
||||
HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle);
|
||||
HYPRE_Int hypre_SyncComputeStream(hypre_Handle *hypre_handle);
|
||||
|
||||
/* handle.c */
|
||||
HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse );
|
||||
|
||||
Loading…
Reference in New Issue
Block a user