hypre/src/parcsr_mv/par_csr_matvec.c
/******************************************************************************
* Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
* HYPRE Project Developers. See the top-level COPYRIGHT file for details.
*
* SPDX-License-Identifier: (Apache-2.0 OR MIT)
******************************************************************************/
/******************************************************************************
*
* Matvec functions for hypre_CSRMatrix class.
*
*****************************************************************************/
#include "_hypre_onedpl.hpp"
#include "_hypre_parcsr_mv.h"
#include "_hypre_utilities.hpp" //RL: TODO par_csr_matvec_device.c, include cuda there
/*--------------------------------------------------------------------------
 * hypre_ParCSRMatrixMatvecOutOfPlace
 *
 * Performs y <- alpha * A * x + beta * b
 * (hypre_ParCSRMatrixMatvec below is the in-place wrapper with b = y)
 *--------------------------------------------------------------------------*/
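/* Outline: pack the boundary entries of x into send buffers, start a
   nonblocking halo exchange that fills x_tmp with the off-processor entries
   of x, compute the local (diag) part while the messages are in flight, and
   finally accumulate the off-diagonal (offd) contribution from x_tmp. */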
HYPRE_Int
hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha,
hypre_ParCSRMatrix *A,
hypre_ParVector *x,
HYPRE_Complex beta,
hypre_ParVector *b,
hypre_ParVector *y )
{
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
hypre_GpuProfilingPushRange("Matvec");
#endif
hypre_ParCSRCommHandle **comm_handle;
hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A);
hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A);
hypre_Vector *x_local = hypre_ParVectorLocalVector(x);
hypre_Vector *b_local = hypre_ParVectorLocalVector(b);
hypre_Vector *y_local = hypre_ParVectorLocalVector(y);
hypre_Vector *x_tmp;
HYPRE_BigInt num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
HYPRE_BigInt num_cols = hypre_ParCSRMatrixGlobalNumCols(A);
HYPRE_BigInt x_size = hypre_ParVectorGlobalSize(x);
HYPRE_BigInt b_size = hypre_ParVectorGlobalSize(b);
HYPRE_BigInt y_size = hypre_ParVectorGlobalSize(y);
HYPRE_Int num_vectors = hypre_VectorNumVectors(x_local);
HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(offd);
HYPRE_Int ierr = 0;
HYPRE_Int num_sends, jv;
HYPRE_Int vecstride = hypre_VectorVectorStride( x_local );
HYPRE_Int idxstride = hypre_VectorIndexStride( x_local );
HYPRE_Complex *x_tmp_data, **x_buf_data;
HYPRE_Complex *x_local_data = hypre_VectorData(x_local);
#if defined(HYPRE_USING_GPU)
HYPRE_Int sync_stream;
hypre_GetSyncCudaCompute(&sync_stream);
hypre_SetSyncCudaCompute(0);
#endif
HYPRE_ANNOTATE_FUNC_BEGIN;
/*---------------------------------------------------------------------
* Check for size compatibility. ParMatvec returns ierr = 11 if
* length of X doesn't equal the number of columns of A,
* ierr = 12 if the length of Y doesn't equal the number of rows
* of A, and ierr = 13 if both are true.
*
* Because temporary vectors are often used in ParMatvec, none of
* these conditions terminates processing, and the ierr flag
* is informational only.
*--------------------------------------------------------------------*/
hypre_assert( idxstride > 0 );
if (num_cols != x_size)
{
ierr = 11;
}
if (num_rows != y_size || num_rows != b_size)
{
ierr = 12;
}
if (num_cols != x_size && (num_rows != y_size || num_rows != b_size))
{
ierr = 13;
}
hypre_assert( hypre_VectorNumVectors(b_local) == num_vectors );
hypre_assert( hypre_VectorNumVectors(y_local) == num_vectors );
if ( num_vectors == 1 )
{
x_tmp = hypre_SeqVectorCreate( num_cols_offd );
}
else
{
hypre_assert( num_vectors > 1 );
x_tmp = hypre_SeqMultiVectorCreate( num_cols_offd, num_vectors );
}
/*---------------------------------------------------------------------
* If there exists no CommPkg for A, a CommPkg is generated using
* equally load balanced partitionings
*--------------------------------------------------------------------*/
if (!comm_pkg)
{
hypre_MatvecCommPkgCreate(A);
comm_pkg = hypre_ParCSRMatrixCommPkg(A);
}
num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
hypre_assert( num_cols_offd == hypre_ParCSRCommPkgRecvVecStart(comm_pkg,
hypre_ParCSRCommPkgNumRecvs(comm_pkg)) );
hypre_assert( hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0) == 0 );
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
#endif
HYPRE_Int use_persistent_comm = 0;
#ifdef HYPRE_USING_PERSISTENT_COMM
use_persistent_comm = num_vectors == 1;
// JSP TODO: we can use persistent communication for multi-vectors,
// but then we need different communication handles for different
// num_vectors.
hypre_ParCSRPersistentCommHandle *persistent_comm_handle;
#endif
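/* With persistent communication (single-vector case only), a persistent
   handle is obtained from comm_pkg and its built-in send/recv buffers are
   used below; otherwise one communication handle is created per vector. */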
if (use_persistent_comm)
{
#ifdef HYPRE_USING_PERSISTENT_COMM
persistent_comm_handle = hypre_ParCSRCommPkgGetPersistentCommHandle(1, comm_pkg);
#endif
}
else
{
comm_handle = hypre_CTAlloc(hypre_ParCSRCommHandle*, num_vectors, HYPRE_MEMORY_HOST);
}
/* x_tmp: receive buffer for the off-processor entries of x (one per column of offd) */
#if defined(HYPRE_USING_GPU)
/* for GPU and single vector, alloc persistent memory for x_tmp (in comm_pkg) and reuse */
if (num_vectors == 1)
{
if (!hypre_ParCSRCommPkgTmpData(comm_pkg))
{
#if 1
hypre_ParCSRCommPkgTmpData(comm_pkg) = hypre_TAlloc(HYPRE_Complex, num_cols_offd,
HYPRE_MEMORY_DEVICE);
#else
hypre_ParCSRCommPkgTmpData(comm_pkg) = _hypre_TAlloc(HYPRE_Complex, num_cols_offd,
hypre_MEMORY_DEVICE);
#endif
}
hypre_VectorData(x_tmp) = hypre_ParCSRCommPkgTmpData(comm_pkg);
hypre_SeqVectorSetDataOwner(x_tmp, 0);
}
#else
if (use_persistent_comm)
{
#ifdef HYPRE_USING_PERSISTENT_COMM
hypre_VectorData(x_tmp) = (HYPRE_Complex *) hypre_ParCSRCommHandleRecvDataBuffer(
persistent_comm_handle);
hypre_SeqVectorSetDataOwner(x_tmp, 0);
#endif
}
#endif
hypre_SeqVectorInitialize_v2(x_tmp, HYPRE_MEMORY_DEVICE);
x_tmp_data = hypre_VectorData(x_tmp);
/* x_buf_data: contiguous send buffers, one per vector, sized by the total number of send elements */
x_buf_data = hypre_CTAlloc(HYPRE_Complex*, num_vectors, HYPRE_MEMORY_HOST);
for (jv = 0; jv < num_vectors; ++jv)
{
#if defined(HYPRE_USING_GPU)
if (jv == 0)
{
if (!hypre_ParCSRCommPkgBufData(comm_pkg))
{
#if 1
hypre_ParCSRCommPkgBufData(comm_pkg) = hypre_TAlloc(HYPRE_Complex,
hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
HYPRE_MEMORY_DEVICE);
#else
hypre_ParCSRCommPkgBufData(comm_pkg) = _hypre_TAlloc(HYPRE_Complex,
hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
hypre_MEMORY_DEVICE);
#endif
}
x_buf_data[0] = hypre_ParCSRCommPkgBufData(comm_pkg);
continue;
}
#endif
if (use_persistent_comm)
{
#ifdef HYPRE_USING_PERSISTENT_COMM
x_buf_data[0] = (HYPRE_Complex *) hypre_ParCSRCommHandleSendDataBuffer(persistent_comm_handle);
continue;
#endif
}
x_buf_data[jv] = hypre_TAlloc(HYPRE_Complex,
hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
HYPRE_MEMORY_DEVICE);
}
/* The assert is here because the following loop only works for 'column'
   storage of a multivector. This needs to be fixed to work more generally,
   at least for 'row' storage. That, in turn, means either changing CommPkg so
   that num_sends is num_zones*num_vectors (not num_zones) or, less dangerously,
   putting a stride into the logic of CommHandleCreate (the stride coming either
   from a new argument or from a new variable inside CommPkg), or moving the
   num_vectors iteration inside CommHandleCreate (perhaps as a new multivector
   variant of it).
*/
hypre_assert( idxstride == 1 );
//hypre_SeqVectorPrefetch(x_local, HYPRE_MEMORY_DEVICE);
/* send_map_elmts on device */
hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg);
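/* pack loop: for each vector, gather the boundary entries of x_local listed
   in send_map_elmts into the contiguous send buffer x_buf_data[jv] */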
for (jv = 0; jv < num_vectors; ++jv)
{
HYPRE_Complex *send_data = (HYPRE_Complex *) x_buf_data[jv];
HYPRE_Complex *locl_data = x_local_data + jv * vecstride;
/* if on device, no need to Sync: send_data is on device memory */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* pack send data on device */
HYPRE_THRUST_CALL( gather,
hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) +
hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
locl_data,
send_data );
#elif defined(HYPRE_USING_SYCL)
auto permuted_source = oneapi::dpl::make_permutation_iterator(locl_data,
hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg));
HYPRE_ONEDPL_CALL( std::copy,
permuted_source,
permuted_source + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
send_data );
#elif defined(HYPRE_USING_DEVICE_OPENMP)
/* pack send data on device */
HYPRE_Int i;
HYPRE_Int *device_send_map_elmts = hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg);
HYPRE_Int start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0);
HYPRE_Int end = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
#pragma omp target teams distribute parallel for private(i) is_device_ptr(send_data, locl_data, device_send_map_elmts)
for (i = start; i < end; i++)
{
send_data[i] = locl_data[device_send_map_elmts[i]];
}
#else
HYPRE_Int i;
/* pack send data on host */
#if defined(HYPRE_USING_OPENMP)
#pragma omp parallel for HYPRE_SMP_SCHEDULE
#endif
for (i = hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0);
i < hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
i ++)
{
send_data[i] = locl_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg, i)];
}
#endif
}
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
#endif
#if defined(HYPRE_USING_GPU) && defined(HYPRE_WITH_GPU_AWARE_MPI) && THRUST_CALL_BLOCKING == 0
/* RL: make sure x_buf_data is ready before issuing GPU-GPU MPI */
hypre_ForceSyncComputeStream(hypre_handle());
#endif
/* when using GPUs, start local matvec first in order to overlap with communication */
#if defined(HYPRE_USING_GPU)
hypre_CSRMatrixMatvecOutOfPlace( alpha, diag, x_local, beta, b_local, y_local, 0 );
#endif
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
#endif
/* nonblocking communication starts */
if (use_persistent_comm)
{
#ifdef HYPRE_USING_PERSISTENT_COMM
hypre_ParCSRPersistentCommHandleStart(persistent_comm_handle, HYPRE_MEMORY_DEVICE, x_buf_data[0]);
#endif
}
else
{
for ( jv = 0; jv < num_vectors; ++jv )
{
comm_handle[jv] = hypre_ParCSRCommHandleCreate_v2( 1, comm_pkg, HYPRE_MEMORY_DEVICE, x_buf_data[jv],
HYPRE_MEMORY_DEVICE, &x_tmp_data[jv * num_cols_offd] );
}
}
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
#endif
#if !defined(HYPRE_USING_GPU)
/* overlapped local computation */
hypre_CSRMatrixMatvecOutOfPlace( alpha, diag, x_local, beta, b_local, y_local, 0 );
#endif
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
#endif
/* nonblocking communication ends */
if (use_persistent_comm)
{
#ifdef HYPRE_USING_PERSISTENT_COMM
hypre_ParCSRPersistentCommHandleWait(persistent_comm_handle, HYPRE_MEMORY_DEVICE, x_tmp_data);
#endif
}
else
{
for ( jv = 0; jv < num_vectors; ++jv )
{
hypre_ParCSRCommHandleDestroy(comm_handle[jv]);
comm_handle[jv] = NULL;
}
hypre_TFree(comm_handle, HYPRE_MEMORY_HOST);
}
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
#endif
/* off-diagonal part: accumulate alpha*offd*x_tmp into y_local (beta has already been applied in the diag matvec above) */
if (num_cols_offd)
{
hypre_CSRMatrixMatvec( alpha, offd, x_tmp, 1.0, y_local );
}
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
#endif
hypre_SeqVectorDestroy(x_tmp); x_tmp = NULL;
if (!use_persistent_comm)
{
for ( jv = 0; jv < num_vectors; ++jv )
{
#if defined(HYPRE_USING_GPU)
if (jv == 0)
{
continue;
}
#endif
hypre_TFree(x_buf_data[jv], HYPRE_MEMORY_DEVICE);
}
hypre_TFree(x_buf_data, HYPRE_MEMORY_HOST);
}
#if defined(HYPRE_USING_GPU)
hypre_SetSyncCudaCompute(sync_stream);
hypre_SyncComputeStream(hypre_handle());
#endif
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
#endif
HYPRE_ANNOTATE_FUNC_END;
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
hypre_GpuProfilingPopRange();
#endif
return ierr;
}
HYPRE_Int
hypre_ParCSRMatrixMatvec( HYPRE_Complex alpha,
hypre_ParCSRMatrix *A,
hypre_ParVector *x,
HYPRE_Complex beta,
hypre_ParVector *y )
{
return hypre_ParCSRMatrixMatvecOutOfPlace(alpha, A, x, beta, y, y);
}
/*--------------------------------------------------------------------------
* hypre_ParCSRMatrixMatvecT
*
* Performs y <- alpha * A^T * x + beta * y
*
*--------------------------------------------------------------------------*/
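/* Outline: the communication pattern of Matvec is reversed. Each process
   first computes its contributions to off-processor entries of y
   (y_tmp = alpha * offd^T * x_local), sends them back to the owning
   processes while the local diag^T part proceeds, and finally accumulates
   the contributions it receives into y_local through send_map_elmts. */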
HYPRE_Int
hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha,
hypre_ParCSRMatrix *A,
hypre_ParVector *x,
HYPRE_Complex beta,
hypre_ParVector *y )
{
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
hypre_GpuProfilingPushRange("MatvecT");
#endif
hypre_ParCSRCommHandle **comm_handle;
hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A);
hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A);
hypre_CSRMatrix *diagT = hypre_ParCSRMatrixDiagT(A);
hypre_CSRMatrix *offdT = hypre_ParCSRMatrixOffdT(A);
hypre_Vector *x_local = hypre_ParVectorLocalVector(x);
hypre_Vector *y_local = hypre_ParVectorLocalVector(y);
hypre_Vector *y_tmp;
HYPRE_BigInt num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
HYPRE_BigInt num_cols = hypre_ParCSRMatrixGlobalNumCols(A);
HYPRE_BigInt x_size = hypre_ParVectorGlobalSize(x);
HYPRE_BigInt y_size = hypre_ParVectorGlobalSize(y);
HYPRE_Int num_vectors = hypre_VectorNumVectors(y_local);
HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(offd);
HYPRE_Int ierr = 0;
HYPRE_Int num_sends, jv;
HYPRE_Int vecstride = hypre_VectorVectorStride(y_local);
HYPRE_Int idxstride = hypre_VectorIndexStride(y_local);
HYPRE_Complex *y_tmp_data, **y_buf_data;
HYPRE_Complex *y_local_data = hypre_VectorData(y_local);
#if defined(HYPRE_USING_GPU)
HYPRE_Int sync_stream;
hypre_GetSyncCudaCompute(&sync_stream);
hypre_SetSyncCudaCompute(0);
#endif
HYPRE_ANNOTATE_FUNC_BEGIN;
/*---------------------------------------------------------------------
* Check for size compatibility. MatvecT returns ierr = 1 if
* length of X doesn't equal the number of rows of A,
* ierr = 2 if the length of Y doesn't equal the number of
* columns of A, and ierr = 3 if both are true.
*
* Because temporary vectors are often used in MatvecT, none of
* these conditions terminates processing, and the ierr flag
* is informational only.
*--------------------------------------------------------------------*/
if (num_rows != x_size)
{
ierr = 1;
}
if (num_cols != y_size)
{
ierr = 2;
}
if (num_rows != x_size && num_cols != y_size)
{
ierr = 3;
}
hypre_assert( hypre_VectorNumVectors(x_local) == num_vectors );
hypre_assert( hypre_VectorNumVectors(y_local) == num_vectors );
if ( num_vectors == 1 )
{
y_tmp = hypre_SeqVectorCreate(num_cols_offd);
}
else
{
hypre_assert( num_vectors > 1 );
y_tmp = hypre_SeqMultiVectorCreate(num_cols_offd, num_vectors);
}
/*---------------------------------------------------------------------
* If there exists no CommPkg for A, a CommPkg is generated using
* equally load balanced partitionings
*--------------------------------------------------------------------*/
if (!comm_pkg)
{
hypre_MatvecCommPkgCreate(A);
comm_pkg = hypre_ParCSRMatrixCommPkg(A);
}
num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
hypre_assert( num_cols_offd == hypre_ParCSRCommPkgRecvVecStart(comm_pkg,
hypre_ParCSRCommPkgNumRecvs(comm_pkg)) );
hypre_assert( hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0) == 0 );
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
#endif
HYPRE_Int use_persistent_comm = 0;
#ifdef HYPRE_USING_PERSISTENT_COMM
use_persistent_comm = num_vectors == 1;
// JSP TODO: we can use persistent communication for multi-vectors,
// but then we need different communication handles for different
// num_vectors.
hypre_ParCSRPersistentCommHandle *persistent_comm_handle;
#endif
if (use_persistent_comm)
{
#ifdef HYPRE_USING_PERSISTENT_COMM
persistent_comm_handle = hypre_ParCSRCommPkgGetPersistentCommHandle(2, comm_pkg);
#endif
}
else
{
comm_handle = hypre_CTAlloc(hypre_ParCSRCommHandle*, num_vectors, HYPRE_MEMORY_HOST);
}
/* y_tmp: buffer of this process's contributions to off-processor entries of y (one per column of offd) */
#if defined(HYPRE_USING_GPU)
/* for GPU and single vector, alloc persistent memory for y_tmp (in comm_pkg) and reuse */
if (num_vectors == 1)
{
if (!hypre_ParCSRCommPkgTmpData(comm_pkg))
{
#if 1
hypre_ParCSRCommPkgTmpData(comm_pkg) = hypre_TAlloc(HYPRE_Complex, num_cols_offd,
HYPRE_MEMORY_DEVICE);
#else
hypre_ParCSRCommPkgTmpData(comm_pkg) = _hypre_TAlloc(HYPRE_Complex, num_cols_offd,
hypre_MEMORY_DEVICE);
#endif
}
hypre_VectorData(y_tmp) = hypre_ParCSRCommPkgTmpData(comm_pkg);
hypre_SeqVectorSetDataOwner(y_tmp, 0);
}
#else
if (use_persistent_comm)
{
#ifdef HYPRE_USING_PERSISTENT_COMM
hypre_VectorData(y_tmp) = (HYPRE_Complex *) hypre_ParCSRCommHandleSendDataBuffer(
persistent_comm_handle);
hypre_SeqVectorSetDataOwner(y_tmp, 0);
#endif
}
#endif
hypre_SeqVectorInitialize_v2(y_tmp, HYPRE_MEMORY_DEVICE);
y_tmp_data = hypre_VectorData(y_tmp);
/* y_buf_data */
y_buf_data = hypre_CTAlloc(HYPRE_Complex*, num_vectors, HYPRE_MEMORY_HOST);
for (jv = 0; jv < num_vectors; ++jv)
{
#if defined(HYPRE_USING_GPU)
if (jv == 0)
{
if (!hypre_ParCSRCommPkgBufData(comm_pkg))
{
#if 1
hypre_ParCSRCommPkgBufData(comm_pkg) = hypre_TAlloc(HYPRE_Complex,
hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
HYPRE_MEMORY_DEVICE);
#else
hypre_ParCSRCommPkgBufData(comm_pkg) = _hypre_TAlloc(HYPRE_Complex,
hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
hypre_MEMORY_DEVICE);
#endif
}
y_buf_data[0] = hypre_ParCSRCommPkgBufData(comm_pkg);
continue;
}
#endif
if (use_persistent_comm)
{
#ifdef HYPRE_USING_PERSISTENT_COMM
y_buf_data[0] = (HYPRE_Complex *) hypre_ParCSRCommHandleRecvDataBuffer(persistent_comm_handle);
continue;
#endif
}
y_buf_data[jv] = hypre_TAlloc(HYPRE_Complex,
hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
HYPRE_MEMORY_DEVICE);
}
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
#endif
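/* contributions to off-processor entries of y; computed first so that y_tmp
   can be communicated while the diag part is being computed */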
if (num_cols_offd)
{
if (offdT)
{
// offdT is optional. Used only if it's present
hypre_CSRMatrixMatvec(alpha, offdT, x_local, 0.0, y_tmp);
}
else
{
hypre_CSRMatrixMatvecT(alpha, offd, x_local, 0.0, y_tmp);
}
}
#if defined(HYPRE_USING_GPU) && defined(HYPRE_WITH_GPU_AWARE_MPI)
/* RL: make sure y_tmp is ready before issuing GPU-GPU MPI */
hypre_ForceSyncComputeStream(hypre_handle());
#endif
/* when using GPUs, start local matvec first in order to overlap with communication */
#if defined(HYPRE_USING_GPU)
if (diagT)
{
// diagT is optional. Used only if it's present.
hypre_CSRMatrixMatvec(alpha, diagT, x_local, beta, y_local);
}
else
{
hypre_CSRMatrixMatvecT(alpha, diag, x_local, beta, y_local);
}
#endif
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
#endif
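/* nonblocking communication starts: send the y_tmp contributions back to
   the processes that own the corresponding entries of y */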
if (use_persistent_comm)
{
#ifdef HYPRE_USING_PERSISTENT_COMM
hypre_ParCSRPersistentCommHandleStart(persistent_comm_handle, HYPRE_MEMORY_DEVICE, y_tmp_data);
#endif
}
else
{
for ( jv = 0; jv < num_vectors; ++jv )
{
/* this is where we assume multivectors are 'column' storage */
comm_handle[jv] = hypre_ParCSRCommHandleCreate_v2( 2, comm_pkg, HYPRE_MEMORY_DEVICE,
&y_tmp_data[jv * num_cols_offd],
HYPRE_MEMORY_DEVICE, y_buf_data[jv] );
}
}
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
#endif
#if !defined(HYPRE_USING_GPU)
/* overlapped local computation */
if (diagT)
{
// diagT is optional. Used only if it's present.
hypre_CSRMatrixMatvec(alpha, diagT, x_local, beta, y_local);
}
else
{
hypre_CSRMatrixMatvecT(alpha, diag, x_local, beta, y_local);
}
#endif
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
#endif
/* nonblocking communication ends */
if (use_persistent_comm)
{
#ifdef HYPRE_USING_PERSISTENT_COMM
hypre_ParCSRPersistentCommHandleWait(persistent_comm_handle, HYPRE_MEMORY_DEVICE, y_buf_data[0]);
#endif
}
else
{
for ( jv = 0; jv < num_vectors; ++jv )
{
hypre_ParCSRCommHandleDestroy(comm_handle[jv]);
comm_handle[jv] = NULL;
}
hypre_TFree(comm_handle, HYPRE_MEMORY_HOST);
}
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
#endif
/* The assert is here because the following loop only works for 'column'
   storage of a multivector. This needs to be fixed to work more generally,
   at least for 'row' storage. That, in turn, means either changing CommPkg so
   that num_sends is num_zones*num_vectors (not num_zones) or, less dangerously,
   putting a stride into the logic of CommHandleCreate (the stride coming either
   from a new argument or from a new variable inside CommPkg), or moving the
   num_vectors iteration inside CommHandleCreate (perhaps as a new multivector
   variant of it).
*/
hypre_assert( idxstride == 1 );
/* send_map_elmts on device */
hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg);
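/* unpack loop: accumulate each received contribution into its owning entry
   of y_local, i.e. locl_data[send_map_elmt[i]] += recv_data[i] */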
for (jv = 0; jv < num_vectors; ++jv)
{
HYPRE_Complex *recv_data = (HYPRE_Complex *) y_buf_data[jv];
HYPRE_Complex *locl_data = y_local_data + jv * vecstride;
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* unpack recv data on device */
hypre_ParCSRMatrixMatvecT_unpack( hypre_ParCSRMatrixNumCols(A), locl_data, recv_data, comm_pkg );
#elif defined(HYPRE_USING_DEVICE_OPENMP)
HYPRE_Int i, j;
/* unpack recv data on device */
for (i = 0; i < num_sends; i++)
{
HYPRE_Int *device_send_map_elmts = hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg);
HYPRE_Int start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
HYPRE_Int end = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1);
#pragma omp target teams distribute parallel for private(j) is_device_ptr(recv_data, locl_data, device_send_map_elmts)
for (j = start; j < end; j++)
{
locl_data[device_send_map_elmts[j]] += recv_data[j];
}
}
#else
HYPRE_Int i;
/* unpack recv data on host, TODO OMP? */
for (i = hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0);
i < hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
i ++)
{
locl_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg, i)] += recv_data[i];
}
#endif
}
hypre_SeqVectorDestroy(y_tmp); y_tmp = NULL;
if (!use_persistent_comm)
{
for ( jv = 0; jv < num_vectors; ++jv )
{
#if defined(HYPRE_USING_GPU)
if (jv == 0)
{
continue;
}
#endif
hypre_TFree(y_buf_data[jv], HYPRE_MEMORY_DEVICE);
}
hypre_TFree(y_buf_data, HYPRE_MEMORY_HOST);
}
#if defined(HYPRE_USING_GPU)
hypre_SetSyncCudaCompute(sync_stream);
hypre_SyncComputeStream(hypre_handle());
#endif
#ifdef HYPRE_PROFILE
hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
#endif
HYPRE_ANNOTATE_FUNC_END;
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
hypre_GpuProfilingPopRange();
#endif
return ierr;
}
/*--------------------------------------------------------------------------
* hypre_ParCSRMatrixMatvec_FF
*--------------------------------------------------------------------------*/
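/* Variant of Matvec restricted to points with CF_marker == fpt (typically
   the fine points). CF_marker is exchanged so that the offd part can also
   be restricted to the off-processor points (CF_marker_offd). */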
HYPRE_Int
hypre_ParCSRMatrixMatvec_FF( HYPRE_Complex alpha,
hypre_ParCSRMatrix *A,
hypre_ParVector *x,
HYPRE_Complex beta,
hypre_ParVector *y,
HYPRE_Int *CF_marker,
HYPRE_Int fpt )
{
MPI_Comm comm = hypre_ParCSRMatrixComm(A);
hypre_ParCSRCommHandle *comm_handle;
hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A);
hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A);
hypre_Vector *x_local = hypre_ParVectorLocalVector(x);
hypre_Vector *y_local = hypre_ParVectorLocalVector(y);
HYPRE_BigInt num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
HYPRE_BigInt num_cols = hypre_ParCSRMatrixGlobalNumCols(A);
hypre_Vector *x_tmp;
HYPRE_BigInt x_size = hypre_ParVectorGlobalSize(x);
HYPRE_BigInt y_size = hypre_ParVectorGlobalSize(y);
HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(offd);
HYPRE_Int ierr = 0;
HYPRE_Int num_sends, i, j, index, start, num_procs;
HYPRE_Int *int_buf_data = NULL;
HYPRE_Int *CF_marker_offd = NULL;
HYPRE_Complex *x_tmp_data = NULL;
HYPRE_Complex *x_buf_data = NULL;
HYPRE_Complex *x_local_data = hypre_VectorData(x_local);
/*---------------------------------------------------------------------
* Check for size compatibility. ParMatvec returns ierr = 11 if
* length of X doesn't equal the number of columns of A,
* ierr = 12 if the length of Y doesn't equal the number of rows
* of A, and ierr = 13 if both are true.
*
* Because temporary vectors are often used in ParMatvec, none of
* these conditions terminates processing, and the ierr flag
* is informational only.
*--------------------------------------------------------------------*/
hypre_MPI_Comm_size(comm, &num_procs);
if (num_cols != x_size)
{
ierr = 11;
}
if (num_rows != y_size)
{
ierr = 12;
}
if (num_cols != x_size && num_rows != y_size)
{
ierr = 13;
}
if (num_procs > 1)
{
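/* pack the boundary entries of x and start a nonblocking exchange that
   fills x_tmp with the off-processor entries of x */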
if (num_cols_offd)
{
x_tmp = hypre_SeqVectorCreate( num_cols_offd );
hypre_SeqVectorInitialize(x_tmp);
x_tmp_data = hypre_VectorData(x_tmp);
}
/*---------------------------------------------------------------------
* If there exists no CommPkg for A, a CommPkg is generated using
* equally load balanced partitionings
*--------------------------------------------------------------------*/
if (!comm_pkg)
{
hypre_MatvecCommPkgCreate(A);
comm_pkg = hypre_ParCSRMatrixCommPkg(A);
}
num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
if (num_sends)
{
   x_buf_data = hypre_CTAlloc(HYPRE_Complex,
                              hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                              HYPRE_MEMORY_HOST);
}
index = 0;
for (i = 0; i < num_sends; i++)
{
start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1); j++)
{
   x_buf_data[index++] = x_local_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg, j)];
}
}
comm_handle =
hypre_ParCSRCommHandleCreate ( 1, comm_pkg, x_buf_data, x_tmp_data );
}
hypre_CSRMatrixMatvec_FF( alpha, diag, x_local, beta, y_local, CF_marker,
CF_marker, fpt);
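/* finish the x exchange, then exchange CF_marker (integer buffers) so that
   the offd matvec below can be restricted via CF_marker_offd */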
if (num_procs > 1)
{
hypre_ParCSRCommHandleDestroy(comm_handle);
comm_handle = NULL;
if (num_sends)
{
   int_buf_data = hypre_CTAlloc(HYPRE_Int,
                                hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                                HYPRE_MEMORY_HOST);
}
if (num_cols_offd) { CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); }
index = 0;
for (i = 0; i < num_sends; i++)
{
start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1); j++)
{
   int_buf_data[index++] = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg, j)];
}
}
comm_handle =
hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data, CF_marker_offd );
hypre_ParCSRCommHandleDestroy(comm_handle);
comm_handle = NULL;
if (num_cols_offd)
{
   hypre_CSRMatrixMatvec_FF( alpha, offd, x_tmp, 1.0, y_local,
                             CF_marker, CF_marker_offd, fpt);
}
hypre_SeqVectorDestroy(x_tmp);
x_tmp = NULL;
hypre_TFree(x_buf_data, HYPRE_MEMORY_HOST);
hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
}
return ierr;
}
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
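/* Device unpack for MatvecT: build (once, and cache in comm_pkg) a CSR
   matrix E that maps receive-buffer entries to local rows, then perform the
   accumulation locl_data += E * recv_data as a single on-device SpMV. */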
HYPRE_Int
hypre_ParCSRMatrixMatvecT_unpack( HYPRE_Int ncols,
HYPRE_Complex *locl_data,
HYPRE_Complex *recv_data,
hypre_ParCSRCommPkg *comm_pkg )
{
HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
HYPRE_Int num_elemt = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
if (num_elemt == 0)
{
return hypre_error_flag;
}
if (!hypre_ParCSRCommPkgMatrixE(comm_pkg))
{
hypre_ParCSRCommPkgCreateMatrixE(comm_pkg, ncols);
}
hypre_CSRMatrix *E = hypre_ParCSRCommPkgMatrixE(comm_pkg);
hypre_Vector vec_x, vec_y;
hypre_VectorData(&vec_x) = recv_data;
hypre_VectorSize(&vec_x) = num_elemt;
hypre_VectorData(&vec_y) = locl_data;
hypre_CSRMatrixSpMVDevice(0, 1.0, E, &vec_x, 1.0, &vec_y, 0);
return hypre_error_flag;
}
#endif