/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

/******************************************************************************
 *
 * Matvec functions for the hypre_ParCSRMatrix class.
 *
 *****************************************************************************/

#include "_hypre_onedpl.hpp"
|
|
#include "_hypre_parcsr_mv.h"
|
|
#include "_hypre_utilities.hpp" //RL: TODO par_csr_matvec_device.c, include cuda there
|
|
|
|
/*--------------------------------------------------------------------------
|
|
* hypre_ParCSRMatrixMatvec
|
|
*--------------------------------------------------------------------------*/
|
|
// y = alpha*A*x + beta*b
|
|
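// Note: this is the out-of-place form: b supplies the vector scaled by beta and
// y receives the result; the in-place wrapper hypre_ParCSRMatrixMatvec below
// simply calls this routine with b == y.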
HYPRE_Int
hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
                                    hypre_ParCSRMatrix *A,
                                    hypre_ParVector    *x,
                                    HYPRE_Complex       beta,
                                    hypre_ParVector    *b,
                                    hypre_ParVector    *y )
{
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   hypre_GpuProfilingPushRange("Matvec");
#endif

   hypre_ParCSRCommHandle **comm_handle;
   hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);

   hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A);
   hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A);

   hypre_Vector *x_local = hypre_ParVectorLocalVector(x);
   hypre_Vector *b_local = hypre_ParVectorLocalVector(b);
   hypre_Vector *y_local = hypre_ParVectorLocalVector(y);
   hypre_Vector *x_tmp;

   HYPRE_BigInt num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
   HYPRE_BigInt num_cols = hypre_ParCSRMatrixGlobalNumCols(A);
   HYPRE_BigInt x_size   = hypre_ParVectorGlobalSize(x);
   HYPRE_BigInt b_size   = hypre_ParVectorGlobalSize(b);
   HYPRE_BigInt y_size   = hypre_ParVectorGlobalSize(y);

   HYPRE_Int num_vectors   = hypre_VectorNumVectors(x_local);
   HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(offd);
   HYPRE_Int ierr = 0;
   HYPRE_Int num_sends, jv;

   HYPRE_Int vecstride = hypre_VectorVectorStride( x_local );
   HYPRE_Int idxstride = hypre_VectorIndexStride( x_local );

   HYPRE_Complex *x_tmp_data, **x_buf_data;
   HYPRE_Complex *x_local_data = hypre_VectorData(x_local);

#if defined(HYPRE_USING_GPU)
   HYPRE_Int sync_stream;
   hypre_GetSyncCudaCompute(&sync_stream);
   hypre_SetSyncCudaCompute(0);
#endif

   HYPRE_ANNOTATE_FUNC_BEGIN;

   /*---------------------------------------------------------------------
    * Check for size compatibility.  ParMatvec returns ierr = 11 if
    * length of X doesn't equal the number of columns of A,
    * ierr = 12 if the length of Y doesn't equal the number of rows
    * of A, and ierr = 13 if both are true.
    *
    * Because temporary vectors are often used in ParMatvec, none of
    * these conditions terminates processing, and the ierr flag
    * is informational only.
    *--------------------------------------------------------------------*/
   hypre_assert( idxstride > 0 );

   if (num_cols != x_size)
   {
      ierr = 11;
   }

   if (num_rows != y_size || num_rows != b_size)
   {
      ierr = 12;
   }

   if (num_cols != x_size && (num_rows != y_size || num_rows != b_size))
   {
      ierr = 13;
   }

   hypre_assert( hypre_VectorNumVectors(b_local) == num_vectors );
   hypre_assert( hypre_VectorNumVectors(y_local) == num_vectors );

   if ( num_vectors == 1 )
   {
      x_tmp = hypre_SeqVectorCreate( num_cols_offd );
   }
   else
   {
      hypre_assert( num_vectors > 1 );
      x_tmp = hypre_SeqMultiVectorCreate( num_cols_offd, num_vectors );
   }

   /*---------------------------------------------------------------------
    * If there exists no CommPkg for A, a CommPkg is generated using
    * equally load balanced partitionings
    *--------------------------------------------------------------------*/
   if (!comm_pkg)
   {
      hypre_MatvecCommPkgCreate(A);
      comm_pkg = hypre_ParCSRMatrixCommPkg(A);
   }

   num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
   hypre_assert( num_cols_offd == hypre_ParCSRCommPkgRecvVecStart(comm_pkg,
                                                                  hypre_ParCSRCommPkgNumRecvs(comm_pkg)) );
   hypre_assert( hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0) == 0 );

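   /* comm_pkg describes the halo exchange for A: send_map_elmts lists the local
    * entries of x whose values must be shipped to neighboring ranks, and the
    * received values fill x_tmp, whose length equals num_cols_offd (this is
    * what the asserts above check). */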
#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
#endif

   HYPRE_Int use_persistent_comm = 0;

#ifdef HYPRE_USING_PERSISTENT_COMM
   use_persistent_comm = num_vectors == 1;
   // JSP TODO: we can use persistent communication for multi-vectors,
   // but then we need different communication handles for different
   // num_vectors.
   hypre_ParCSRPersistentCommHandle *persistent_comm_handle;
#endif

   if (use_persistent_comm)
   {
#ifdef HYPRE_USING_PERSISTENT_COMM
      persistent_comm_handle = hypre_ParCSRCommPkgGetPersistentCommHandle(1, comm_pkg);
#endif
   }
   else
   {
      comm_handle = hypre_CTAlloc(hypre_ParCSRCommHandle*, num_vectors, HYPRE_MEMORY_HOST);
   }

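   /* With persistent communication the handle is cached in comm_pkg and keyed by
    * the job type: 1 here matches the hypre_ParCSRCommHandleCreate_v2(1, ...)
    * exchange below (forward matvec, halo values of x), while
    * hypre_ParCSRMatrixMatvecT uses job type 2 for the reverse exchange. */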
   /* x_tmp */
#if defined(HYPRE_USING_GPU)
   /* for GPU and single vector, alloc persistent memory for x_tmp (in comm_pkg) and reuse */
   if (num_vectors == 1)
   {
      if (!hypre_ParCSRCommPkgTmpData(comm_pkg))
      {
#if 1
         hypre_ParCSRCommPkgTmpData(comm_pkg) = hypre_TAlloc(HYPRE_Complex, num_cols_offd,
                                                             HYPRE_MEMORY_DEVICE);
#else
         hypre_ParCSRCommPkgTmpData(comm_pkg) = _hypre_TAlloc(HYPRE_Complex, num_cols_offd,
                                                              hypre_MEMORY_DEVICE);
#endif
      }
      hypre_VectorData(x_tmp) = hypre_ParCSRCommPkgTmpData(comm_pkg);
      hypre_SeqVectorSetDataOwner(x_tmp, 0);
   }
#else
   if (use_persistent_comm)
   {
#ifdef HYPRE_USING_PERSISTENT_COMM
      hypre_VectorData(x_tmp) = (HYPRE_Complex *) hypre_ParCSRCommHandleRecvDataBuffer(
                                   persistent_comm_handle);
      hypre_SeqVectorSetDataOwner(x_tmp, 0);
#endif
   }
#endif

   hypre_SeqVectorInitialize_v2(x_tmp, HYPRE_MEMORY_DEVICE);
   x_tmp_data = hypre_VectorData(x_tmp);

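   /* x_tmp is the length-num_cols_offd receive buffer for the halo values of x.
    * On the GPU path it wraps the buffer cached in comm_pkg, and with persistent
    * communication it wraps the handle's receive buffer (data owner set to 0);
    * otherwise hypre_SeqVectorInitialize_v2 allocates fresh device memory. */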
   /* x_buf_data */
   x_buf_data = hypre_CTAlloc(HYPRE_Complex*, num_vectors, HYPRE_MEMORY_HOST);

   for (jv = 0; jv < num_vectors; ++jv)
   {
#if defined(HYPRE_USING_GPU)
      if (jv == 0)
      {
         if (!hypre_ParCSRCommPkgBufData(comm_pkg))
         {
#if 1
            hypre_ParCSRCommPkgBufData(comm_pkg) = hypre_TAlloc(HYPRE_Complex,
                                                                hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                                                                HYPRE_MEMORY_DEVICE);
#else
            hypre_ParCSRCommPkgBufData(comm_pkg) = _hypre_TAlloc(HYPRE_Complex,
                                                                 hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                                                                 hypre_MEMORY_DEVICE);
#endif
         }
         x_buf_data[0] = hypre_ParCSRCommPkgBufData(comm_pkg);
         continue;
      }
#endif
      if (use_persistent_comm)
      {
#ifdef HYPRE_USING_PERSISTENT_COMM
         x_buf_data[0] = (HYPRE_Complex *) hypre_ParCSRCommHandleSendDataBuffer(persistent_comm_handle);
         continue;
#endif
      }

      x_buf_data[jv] = hypre_TAlloc(HYPRE_Complex,
                                    hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                                    HYPRE_MEMORY_DEVICE);
   }

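   /* Each x_buf_data[jv] is a device send buffer large enough for the
    * hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) packed entries of
    * vector jv; on the GPU path the jv == 0 buffer is cached in comm_pkg, and
    * with persistent communication the handle's send buffer is reused. */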
   /* The assert is because the following loop only works for 'column'
      storage of a multivector. This needs to be fixed to work more generally,
      at least for 'row' storage. This, in turn, means either changing CommPkg so
      that num_sends is no.zones*no.vectors (not no.zones) or, less dangerously,
      putting a stride in the logic of CommHandleCreate (stride either from a new
      arg or a new variable inside CommPkg), or putting the num_vector iteration
      inside CommHandleCreate (perhaps a new multivector variant of it).
   */

   hypre_assert( idxstride == 1 );

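   /* 'Column' storage means vector jv occupies the contiguous slice starting at
    * x_local_data + jv * vecstride with unit index stride, which is exactly what
    * the per-vector pointer arithmetic in the packing loop below relies on. */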
   //hypre_SeqVectorPrefetch(x_local, HYPRE_MEMORY_DEVICE);

   /* send_map_elmts on device */
   hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg);

   for (jv = 0; jv < num_vectors; ++jv)
   {
      HYPRE_Complex *send_data = (HYPRE_Complex *) x_buf_data[jv];
      HYPRE_Complex *locl_data = x_local_data + jv * vecstride;

      /* if on device, no need to Sync: send_data is on device memory */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
      /* pack send data on device */
      HYPRE_THRUST_CALL( gather,
                         hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
                         hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) +
                         hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                         locl_data,
                         send_data );
#elif defined(HYPRE_USING_SYCL)
      auto permuted_source = oneapi::dpl::make_permutation_iterator(locl_data,
                                                                    hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg));
      HYPRE_ONEDPL_CALL( std::copy,
                         permuted_source,
                         permuted_source + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                         send_data );
#elif defined(HYPRE_USING_DEVICE_OPENMP)
      /* pack send data on device */
      HYPRE_Int i;
      HYPRE_Int *device_send_map_elmts = hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg);
      HYPRE_Int start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0);
      HYPRE_Int end = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
      #pragma omp target teams distribute parallel for private(i) is_device_ptr(send_data, locl_data, device_send_map_elmts)
      for (i = start; i < end; i++)
      {
         send_data[i] = locl_data[device_send_map_elmts[i]];
      }
#else
      HYPRE_Int i;
      /* pack send data on host */
#if defined(HYPRE_USING_OPENMP)
      #pragma omp parallel for HYPRE_SMP_SCHEDULE
#endif
      for (i = hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0);
           i < hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
           i ++)
      {
         send_data[i] = locl_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg, i)];
      }
#endif
   }

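   /* The packed buffer follows the send_map_elmts ordering, so the entries
    * destined for each neighbor form one contiguous segment delimited by
    * hypre_ParCSRCommPkgSendMapStart, which is the layout the communication
    * handle expects. */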
#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
#endif

#if defined(HYPRE_USING_GPU) && defined(HYPRE_WITH_GPU_AWARE_MPI) && THRUST_CALL_BLOCKING == 0
   /* RL: make sure x_buf_data is ready before issuing GPU-GPU MPI */
   hypre_ForceSyncComputeStream(hypre_handle());
#endif

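   /* Rationale: with GPU-aware MPI the send buffer lives in device memory, so the
    * packing kernel launched above must have finished before MPI reads it; the
    * explicit sync is only needed when Thrust calls are non-blocking. */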
   /* when using GPUs, start local matvec first in order to overlap with communication */
#if defined(HYPRE_USING_GPU)
   hypre_CSRMatrixMatvecOutOfPlace( alpha, diag, x_local, beta, b_local, y_local, 0 );
#endif

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
#endif

   /* nonblocking communication starts */
   if (use_persistent_comm)
   {
#ifdef HYPRE_USING_PERSISTENT_COMM
      hypre_ParCSRPersistentCommHandleStart(persistent_comm_handle, HYPRE_MEMORY_DEVICE, x_buf_data[0]);
#endif
   }
   else
   {
      for ( jv = 0; jv < num_vectors; ++jv )
      {
         comm_handle[jv] = hypre_ParCSRCommHandleCreate_v2( 1, comm_pkg, HYPRE_MEMORY_DEVICE, x_buf_data[jv],
                                                            HYPRE_MEMORY_DEVICE, &x_tmp_data[jv * num_cols_offd] );
      }
   }

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
#endif

#if !defined(HYPRE_USING_GPU)
   /* overlapped local computation */
   hypre_CSRMatrixMatvecOutOfPlace( alpha, diag, x_local, beta, b_local, y_local, 0 );
#endif

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
#endif

   /* nonblocking communication ends */
   if (use_persistent_comm)
   {
#ifdef HYPRE_USING_PERSISTENT_COMM
      hypre_ParCSRPersistentCommHandleWait(persistent_comm_handle, HYPRE_MEMORY_DEVICE, x_tmp_data);
#endif
   }
   else
   {
      for ( jv = 0; jv < num_vectors; ++jv )
      {
         hypre_ParCSRCommHandleDestroy(comm_handle[jv]);
         comm_handle[jv] = NULL;
      }
      hypre_TFree(comm_handle, HYPRE_MEMORY_HOST);
   }

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
#endif

   /* computation offd part */
   if (num_cols_offd)
   {
      hypre_CSRMatrixMatvec( alpha, offd, x_tmp, 1.0, y_local );
   }

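   /* The off-diagonal product accumulates into y_local with coefficient 1.0
    * because beta*b has already been folded in by the diagonal matvec above. */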
#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
#endif

   hypre_SeqVectorDestroy(x_tmp); x_tmp = NULL;

   if (!use_persistent_comm)
   {
      for ( jv = 0; jv < num_vectors; ++jv )
      {
#if defined(HYPRE_USING_GPU)
         if (jv == 0)
         {
            continue;
         }
#endif
         hypre_TFree(x_buf_data[jv], HYPRE_MEMORY_DEVICE);
      }
      hypre_TFree(x_buf_data, HYPRE_MEMORY_HOST);
   }

#if defined(HYPRE_USING_GPU)
   hypre_SetSyncCudaCompute(sync_stream);
   hypre_SyncComputeStream(hypre_handle());
#endif

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
#endif

   HYPRE_ANNOTATE_FUNC_END;

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   hypre_GpuProfilingPopRange();
#endif

   return ierr;
}

HYPRE_Int
hypre_ParCSRMatrixMatvec( HYPRE_Complex       alpha,
                          hypre_ParCSRMatrix *A,
                          hypre_ParVector    *x,
                          HYPRE_Complex       beta,
                          hypre_ParVector    *y )
{
   return hypre_ParCSRMatrixMatvecOutOfPlace(alpha, A, x, beta, y, y);
}

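/* Usage sketch, assuming A, x, and y have already been created with compatible
 * partitionings:
 *
 *    // y <- 2.0 * A * x + 0.5 * y
 *    hypre_ParCSRMatrixMatvec(2.0, A, x, 0.5, y);
 */
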
/*--------------------------------------------------------------------------
 * hypre_ParCSRMatrixMatvecT
 *
 * Performs y <- alpha * A^T * x + beta * y
 *
 *--------------------------------------------------------------------------*/
HYPRE_Int
hypre_ParCSRMatrixMatvecT( HYPRE_Complex       alpha,
                           hypre_ParCSRMatrix *A,
                           hypre_ParVector    *x,
                           HYPRE_Complex       beta,
                           hypre_ParVector    *y )
{
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   hypre_GpuProfilingPushRange("MatvecT");
#endif

   hypre_ParCSRCommHandle **comm_handle;
   hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);

   hypre_CSRMatrix *diag  = hypre_ParCSRMatrixDiag(A);
   hypre_CSRMatrix *offd  = hypre_ParCSRMatrixOffd(A);
   hypre_CSRMatrix *diagT = hypre_ParCSRMatrixDiagT(A);
   hypre_CSRMatrix *offdT = hypre_ParCSRMatrixOffdT(A);

   hypre_Vector *x_local = hypre_ParVectorLocalVector(x);
   hypre_Vector *y_local = hypre_ParVectorLocalVector(y);
   hypre_Vector *y_tmp;

   HYPRE_BigInt num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
   HYPRE_BigInt num_cols = hypre_ParCSRMatrixGlobalNumCols(A);
   HYPRE_BigInt x_size   = hypre_ParVectorGlobalSize(x);
   HYPRE_BigInt y_size   = hypre_ParVectorGlobalSize(y);

   HYPRE_Int num_vectors   = hypre_VectorNumVectors(y_local);
   HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(offd);
   HYPRE_Int ierr = 0;
   HYPRE_Int num_sends, jv;

   HYPRE_Int vecstride = hypre_VectorVectorStride(y_local);
   HYPRE_Int idxstride = hypre_VectorIndexStride(y_local);

   HYPRE_Complex *y_tmp_data, **y_buf_data;
   HYPRE_Complex *y_local_data = hypre_VectorData(y_local);

#if defined(HYPRE_USING_GPU)
   HYPRE_Int sync_stream;
   hypre_GetSyncCudaCompute(&sync_stream);
   hypre_SetSyncCudaCompute(0);
#endif

   HYPRE_ANNOTATE_FUNC_BEGIN;

   /*---------------------------------------------------------------------
    * Check for size compatibility.  MatvecT returns ierr = 1 if
    * length of X doesn't equal the number of rows of A,
    * ierr = 2 if the length of Y doesn't equal the number of
    * columns of A, and ierr = 3 if both are true.
    *
    * Because temporary vectors are often used in MatvecT, none of
    * these conditions terminates processing, and the ierr flag
    * is informational only.
    *--------------------------------------------------------------------*/
   if (num_rows != x_size)
   {
      ierr = 1;
   }

   if (num_cols != y_size)
   {
      ierr = 2;
   }

   if (num_rows != x_size && num_cols != y_size)
   {
      ierr = 3;
   }

   hypre_assert( hypre_VectorNumVectors(x_local) == num_vectors );
   hypre_assert( hypre_VectorNumVectors(y_local) == num_vectors );

   if ( num_vectors == 1 )
   {
      y_tmp = hypre_SeqVectorCreate(num_cols_offd);
   }
   else
   {
      hypre_assert( num_vectors > 1 );
      y_tmp = hypre_SeqMultiVectorCreate(num_cols_offd, num_vectors);
   }

   /*---------------------------------------------------------------------
    * If there exists no CommPkg for A, a CommPkg is generated using
    * equally load balanced partitionings
    *--------------------------------------------------------------------*/
   if (!comm_pkg)
   {
      hypre_MatvecCommPkgCreate(A);
      comm_pkg = hypre_ParCSRMatrixCommPkg(A);
   }

   num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
   hypre_assert( num_cols_offd == hypre_ParCSRCommPkgRecvVecStart(comm_pkg,
                                                                  hypre_ParCSRCommPkgNumRecvs(comm_pkg)) );
   hypre_assert( hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0) == 0 );

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
#endif

   HYPRE_Int use_persistent_comm = 0;

#ifdef HYPRE_USING_PERSISTENT_COMM
   use_persistent_comm = num_vectors == 1;
   // JSP TODO: we can use persistent communication for multi-vectors,
   // but then we need different communication handles for different
   // num_vectors.
   hypre_ParCSRPersistentCommHandle *persistent_comm_handle;
#endif

   if (use_persistent_comm)
   {
#ifdef HYPRE_USING_PERSISTENT_COMM
      persistent_comm_handle = hypre_ParCSRCommPkgGetPersistentCommHandle(2, comm_pkg);
#endif
   }
   else
   {
      comm_handle = hypre_CTAlloc(hypre_ParCSRCommHandle*, num_vectors, HYPRE_MEMORY_HOST);
   }

   /* y_tmp */
#if defined(HYPRE_USING_GPU)
   /* for GPU and single vector, alloc persistent memory for y_tmp (in comm_pkg) and reuse */
   if (num_vectors == 1)
   {
      if (!hypre_ParCSRCommPkgTmpData(comm_pkg))
      {
#if 1
         hypre_ParCSRCommPkgTmpData(comm_pkg) = hypre_TAlloc(HYPRE_Complex, num_cols_offd,
                                                             HYPRE_MEMORY_DEVICE);
#else
         hypre_ParCSRCommPkgTmpData(comm_pkg) = _hypre_TAlloc(HYPRE_Complex, num_cols_offd,
                                                              hypre_MEMORY_DEVICE);
#endif
      }
      hypre_VectorData(y_tmp) = hypre_ParCSRCommPkgTmpData(comm_pkg);
      hypre_SeqVectorSetDataOwner(y_tmp, 0);
   }
#else
   if (use_persistent_comm)
   {
#ifdef HYPRE_USING_PERSISTENT_COMM
      hypre_VectorData(y_tmp) = (HYPRE_Complex *) hypre_ParCSRCommHandleSendDataBuffer(
                                   persistent_comm_handle);
      hypre_SeqVectorSetDataOwner(y_tmp, 0);
#endif
   }
#endif

   hypre_SeqVectorInitialize_v2(y_tmp, HYPRE_MEMORY_DEVICE);
   y_tmp_data = hypre_VectorData(y_tmp);

   /* y_buf_data */
   y_buf_data = hypre_CTAlloc(HYPRE_Complex*, num_vectors, HYPRE_MEMORY_HOST);

   for (jv = 0; jv < num_vectors; ++jv)
   {
#if defined(HYPRE_USING_GPU)
      if (jv == 0)
      {
         if (!hypre_ParCSRCommPkgBufData(comm_pkg))
         {
#if 1
            hypre_ParCSRCommPkgBufData(comm_pkg) = hypre_TAlloc(HYPRE_Complex,
                                                                hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                                                                HYPRE_MEMORY_DEVICE);
#else
            hypre_ParCSRCommPkgBufData(comm_pkg) = _hypre_TAlloc(HYPRE_Complex,
                                                                 hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                                                                 hypre_MEMORY_DEVICE);
#endif
         }
         y_buf_data[0] = hypre_ParCSRCommPkgBufData(comm_pkg);
         continue;
      }
#endif
      if (use_persistent_comm)
      {
#ifdef HYPRE_USING_PERSISTENT_COMM
         y_buf_data[0] = (HYPRE_Complex *) hypre_ParCSRCommHandleRecvDataBuffer(persistent_comm_handle);
         continue;
#endif
      }

      y_buf_data[jv] = hypre_TAlloc(HYPRE_Complex,
                                    hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                                    HYPRE_MEMORY_DEVICE);
   }

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
#endif

   if (num_cols_offd)
   {
      if (offdT)
      {
         // offdT is optional. Used only if it's present
         hypre_CSRMatrixMatvec(alpha, offdT, x_local, 0.0, y_tmp);
      }
      else
      {
         hypre_CSRMatrixMatvecT(alpha, offd, x_local, 0.0, y_tmp);
      }
   }

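   /* For the transpose product the roles are reversed: each rank first computes
    * the contribution of its off-diagonal block into y_tmp, and those partial
    * sums are then communicated back to the ranks that own the corresponding
    * rows of y, where they are accumulated in the unpack loop below. */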
#if defined(HYPRE_USING_GPU) && defined(HYPRE_WITH_GPU_AWARE_MPI)
   /* RL: make sure y_tmp is ready before issuing GPU-GPU MPI */
   hypre_ForceSyncComputeStream(hypre_handle());
#endif

   /* when using GPUs, start local matvec first in order to overlap with communication */
#if defined(HYPRE_USING_GPU)
   if (diagT)
   {
      // diagT is optional. Used only if it's present.
      hypre_CSRMatrixMatvec(alpha, diagT, x_local, beta, y_local);
   }
   else
   {
      hypre_CSRMatrixMatvecT(alpha, diag, x_local, beta, y_local);
   }
#endif

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
#endif

   if (use_persistent_comm)
   {
#ifdef HYPRE_USING_PERSISTENT_COMM
      hypre_ParCSRPersistentCommHandleStart(persistent_comm_handle, HYPRE_MEMORY_DEVICE, y_tmp_data);
#endif
   }
   else
   {
      for ( jv = 0; jv < num_vectors; ++jv )
      {
         /* this is where we assume multivectors are 'column' storage */
         comm_handle[jv] = hypre_ParCSRCommHandleCreate_v2( 2, comm_pkg, HYPRE_MEMORY_DEVICE,
                                                            &y_tmp_data[jv * num_cols_offd],
                                                            HYPRE_MEMORY_DEVICE, y_buf_data[jv] );
      }
   }

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
#endif

#if !defined(HYPRE_USING_GPU)
   /* overlapped local computation */
   if (diagT)
   {
      // diagT is optional. Used only if it's present.
      hypre_CSRMatrixMatvec(alpha, diagT, x_local, beta, y_local);
   }
   else
   {
      hypre_CSRMatrixMatvecT(alpha, diag, x_local, beta, y_local);
   }
#endif

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
#endif

   /* nonblocking communication ends */
   if (use_persistent_comm)
   {
#ifdef HYPRE_USING_PERSISTENT_COMM
      hypre_ParCSRPersistentCommHandleWait(persistent_comm_handle, HYPRE_MEMORY_DEVICE, y_buf_data[0]);
#endif
   }
   else
   {
      for ( jv = 0; jv < num_vectors; ++jv )
      {
         hypre_ParCSRCommHandleDestroy(comm_handle[jv]);
         comm_handle[jv] = NULL;
      }
      hypre_TFree(comm_handle, HYPRE_MEMORY_HOST);
   }

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
   hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
#endif

   /* The assert is because the following loop only works for 'column'
      storage of a multivector. This needs to be fixed to work more generally,
      at least for 'row' storage. This, in turn, means either changing CommPkg so
      that num_sends is no.zones*no.vectors (not no.zones) or, less dangerously,
      putting a stride in the logic of CommHandleCreate (stride either from a new
      arg or a new variable inside CommPkg), or putting the num_vector iteration
      inside CommHandleCreate (perhaps a new multivector variant of it).
   */

   hypre_assert( idxstride == 1 );

   /* send_map_elmts on device */
   hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg);

   for (jv = 0; jv < num_vectors; ++jv)
   {
      HYPRE_Complex *recv_data = (HYPRE_Complex *) y_buf_data[jv];
      HYPRE_Complex *locl_data = y_local_data + jv * vecstride;

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
      /* unpack recv data on device */
      hypre_ParCSRMatrixMatvecT_unpack( hypre_ParCSRMatrixNumCols(A), locl_data, recv_data, comm_pkg );
#elif defined(HYPRE_USING_DEVICE_OPENMP)
      HYPRE_Int i, j;
      /* unpack recv data on device */
      for (i = 0; i < num_sends; i++)
      {
         HYPRE_Int *device_send_map_elmts = hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg);
         HYPRE_Int start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
         HYPRE_Int end = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1);
         #pragma omp target teams distribute parallel for private(j) is_device_ptr(recv_data, locl_data, device_send_map_elmts)
         for (j = start; j < end; j++)
         {
            locl_data[device_send_map_elmts[j]] += recv_data[j];
         }
      }
#else
      HYPRE_Int i;
      /* unpack recv data on host, TODO OMP? */
      for (i = hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0);
           i < hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
           i ++)
      {
         locl_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg, i)] += recv_data[i];
      }
#endif
   }

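   /* On CUDA/HIP the unpack above is performed as a device SpMV (see
    * hypre_ParCSRMatrixMatvecT_unpack at the end of this file); on the other
    * paths the received partial sums are added into the owned entries listed by
    * send_map_elmts. */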
   hypre_SeqVectorDestroy(y_tmp); y_tmp = NULL;

   if (!use_persistent_comm)
   {
      for ( jv = 0; jv < num_vectors; ++jv )
      {
#if defined(HYPRE_USING_GPU)
         if (jv == 0)
         {
            continue;
         }
#endif
         hypre_TFree(y_buf_data[jv], HYPRE_MEMORY_DEVICE);
      }
      hypre_TFree(y_buf_data, HYPRE_MEMORY_HOST);
   }

#if defined(HYPRE_USING_GPU)
   hypre_SetSyncCudaCompute(sync_stream);
   hypre_SyncComputeStream(hypre_handle());
#endif

#ifdef HYPRE_PROFILE
   hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
#endif

   HYPRE_ANNOTATE_FUNC_END;

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   hypre_GpuProfilingPopRange();
#endif

   return ierr;
}

/*--------------------------------------------------------------------------
 * hypre_ParCSRMatrixMatvec_FF
 *--------------------------------------------------------------------------*/
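/* Computes y = alpha*A*x + beta*y restricted by a CF marker: as suggested by the
 * "_FF" suffix, only entries flagged with CF_marker == fpt are intended to
 * participate; the actual row/column filtering is done inside
 * hypre_CSRMatrixMatvec_FF, which receives the marker for both sides. */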
HYPRE_Int
hypre_ParCSRMatrixMatvec_FF( HYPRE_Complex       alpha,
                             hypre_ParCSRMatrix *A,
                             hypre_ParVector    *x,
                             HYPRE_Complex       beta,
                             hypre_ParVector    *y,
                             HYPRE_Int          *CF_marker,
                             HYPRE_Int           fpt )
{
   MPI_Comm                comm = hypre_ParCSRMatrixComm(A);
   hypre_ParCSRCommHandle *comm_handle;
   hypre_ParCSRCommPkg    *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
   hypre_CSRMatrix        *diag = hypre_ParCSRMatrixDiag(A);
   hypre_CSRMatrix        *offd = hypre_ParCSRMatrixOffd(A);
   hypre_Vector           *x_local = hypre_ParVectorLocalVector(x);
   hypre_Vector           *y_local = hypre_ParVectorLocalVector(y);
   HYPRE_BigInt            num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
   HYPRE_BigInt            num_cols = hypre_ParCSRMatrixGlobalNumCols(A);

   hypre_Vector   *x_tmp;
   HYPRE_BigInt    x_size = hypre_ParVectorGlobalSize(x);
   HYPRE_BigInt    y_size = hypre_ParVectorGlobalSize(y);
   HYPRE_Int       num_cols_offd = hypre_CSRMatrixNumCols(offd);
   HYPRE_Int       ierr = 0;
   HYPRE_Int       num_sends, i, j, index, start, num_procs;
   HYPRE_Int      *int_buf_data = NULL;
   HYPRE_Int      *CF_marker_offd = NULL;

   HYPRE_Complex  *x_tmp_data = NULL;
   HYPRE_Complex  *x_buf_data = NULL;
   HYPRE_Complex  *x_local_data = hypre_VectorData(x_local);
   /*---------------------------------------------------------------------
    * Check for size compatibility.  ParMatvec returns ierr = 11 if
    * length of X doesn't equal the number of columns of A,
    * ierr = 12 if the length of Y doesn't equal the number of rows
    * of A, and ierr = 13 if both are true.
    *
    * Because temporary vectors are often used in ParMatvec, none of
    * these conditions terminates processing, and the ierr flag
    * is informational only.
    *--------------------------------------------------------------------*/

   hypre_MPI_Comm_size(comm, &num_procs);

   if (num_cols != x_size)
   {
      ierr = 11;
   }

   if (num_rows != y_size)
   {
      ierr = 12;
   }

   if (num_cols != x_size && num_rows != y_size)
   {
      ierr = 13;
   }

   if (num_procs > 1)
   {
      if (num_cols_offd)
      {
         x_tmp = hypre_SeqVectorCreate( num_cols_offd );
         hypre_SeqVectorInitialize(x_tmp);
         x_tmp_data = hypre_VectorData(x_tmp);
      }

      /*---------------------------------------------------------------------
       * If there exists no CommPkg for A, a CommPkg is generated using
       * equally load balanced partitionings
       *--------------------------------------------------------------------*/
      if (!comm_pkg)
      {
         hypre_MatvecCommPkgCreate(A);
         comm_pkg = hypre_ParCSRMatrixCommPkg(A);
      }

      num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
      if (num_sends)
      {
         x_buf_data = hypre_CTAlloc(HYPRE_Complex, hypre_ParCSRCommPkgSendMapStart
                                    (comm_pkg, num_sends), HYPRE_MEMORY_HOST);
      }

      index = 0;
      for (i = 0; i < num_sends; i++)
      {
         start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
         for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1); j++)
         {
            x_buf_data[index++]
               = x_local_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg, j)];
         }
      }
      comm_handle =
         hypre_ParCSRCommHandleCreate ( 1, comm_pkg, x_buf_data, x_tmp_data );
   }
   hypre_CSRMatrixMatvec_FF( alpha, diag, x_local, beta, y_local, CF_marker,
                             CF_marker, fpt);

   if (num_procs > 1)
   {
      hypre_ParCSRCommHandleDestroy(comm_handle);
      comm_handle = NULL;

      if (num_sends)
      {
         int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart
                                      (comm_pkg, num_sends), HYPRE_MEMORY_HOST);
      }
      if (num_cols_offd) { CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); }
      index = 0;
      for (i = 0; i < num_sends; i++)
      {
         start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
         for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1); j++)
         {
            int_buf_data[index++]
               = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg, j)];
         }
      }
      comm_handle =
         hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data, CF_marker_offd );

      hypre_ParCSRCommHandleDestroy(comm_handle);
      comm_handle = NULL;

      if (num_cols_offd)
      {
         hypre_CSRMatrixMatvec_FF( alpha, offd, x_tmp, 1.0, y_local,
                                   CF_marker, CF_marker_offd, fpt);
      }

      hypre_SeqVectorDestroy(x_tmp);
      x_tmp = NULL;
      hypre_TFree(x_buf_data, HYPRE_MEMORY_HOST);
      hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
      hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
   }

   return ierr;
}

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
HYPRE_Int
hypre_ParCSRMatrixMatvecT_unpack( HYPRE_Int            ncols,
                                  HYPRE_Complex       *locl_data,
                                  HYPRE_Complex       *recv_data,
                                  hypre_ParCSRCommPkg *comm_pkg )
{
   HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
   HYPRE_Int num_elemt = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);

   if (num_elemt == 0)
   {
      return hypre_error_flag;
   }

   if (!hypre_ParCSRCommPkgMatrixE(comm_pkg))
   {
      hypre_ParCSRCommPkgCreateMatrixE(comm_pkg, ncols);
   }

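   /* E is a CSR matrix cached in comm_pkg that maps the packed receive buffer
    * (length num_elemt) onto the local vector (length ncols); presumably it
    * carries a unit entry linking position i of the buffer to send_map_elmts[i].
    * With it, the scattered accumulation becomes a single device SpMV,
    * vec_y += 1.0 * E * vec_x. */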
   hypre_CSRMatrix *E = hypre_ParCSRCommPkgMatrixE(comm_pkg);
   hypre_Vector vec_x, vec_y;
   hypre_VectorData(&vec_x) = recv_data;
   hypre_VectorSize(&vec_x) = num_elemt;
   hypre_VectorData(&vec_y) = locl_data;

   hypre_CSRMatrixSpMVDevice(0, 1.0, E, &vec_x, 1.0, &vec_y, 0);

   return hypre_error_flag;
}
#endif