/* hypre/parcsr_mv/communicationT.c */

/*BHEADER**********************************************************************
* Copyright (c) 2007, Lawrence Livermore National Security, LLC.
* Produced at the Lawrence Livermore National Laboratory.
* Written by the HYPRE team. UCRL-CODE-222953.
* All rights reserved.
*
* This file is part of HYPRE (see http://www.llnl.gov/CASC/hypre/).
* Please see the COPYRIGHT_and_LICENSE file for the copyright notice,
* disclaimer, contact information and the GNU Lesser General Public License.
*
* HYPRE is free software; you can redistribute it and/or modify it under the
* terms of the GNU General Public License (as published by the Free Software
* Foundation) version 2.1 dated February 1999.
*
* HYPRE is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the terms and conditions of the GNU General
* Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* $Revision$
***********************************************************************EHEADER*/
#include "headers.h"
void RowsWithColumn_original
( int * rowmin, int * rowmax, int column, hypre_ParCSRMatrix * A )
/* Finds the rows of A which have a nonzero in the given (global) column.
Sets *rowmin to the minimum (local) row number of such rows, and *rowmax
to the maximum. If there are no such rows, returns rowmax < 0 <= rowmin. */
{
hypre_CSRMatrix * diag = hypre_ParCSRMatrixDiag(A);
hypre_CSRMatrix * offd = hypre_ParCSRMatrixOffd(A);
int * mat_i, * mat_j;
int i, j, num_rows;
int firstColDiag;
int * colMapOffd;
mat_i = hypre_CSRMatrixI(diag);
mat_j = hypre_CSRMatrixJ(diag);
num_rows = hypre_CSRMatrixNumRows(diag);
firstColDiag = hypre_ParCSRMatrixFirstColDiag(A);
*rowmin = num_rows;
*rowmax = -1;
for ( i=0; i<num_rows; ++i ) {
/* global number: row = i + firstRowIndex;*/
for ( j=mat_i[i]; j<mat_i[i+1]; ++j ) {
if ( mat_j[j]+firstColDiag==column ) {
/* row i (local row number) has column mat_j[j] (local column number) */
*rowmin = i<*rowmin ? i : *rowmin;
*rowmax = i>*rowmax ? i : *rowmax;
break;
}
}
}
mat_i = hypre_CSRMatrixI(offd);
mat_j = hypre_CSRMatrixJ(offd);
num_rows = hypre_CSRMatrixNumRows(offd);
colMapOffd = hypre_ParCSRMatrixColMapOffd(A);
for ( i=0; i<num_rows; ++i ) {
/* global number: row = i + firstRowIndex;*/
for ( j=mat_i[i]; j<mat_i[i+1]; ++j ) {
if ( colMapOffd[ mat_j[j] ]==column ) {
/* row i (local row number) has column mat_j[j] (local column number) */
*rowmin = i<*rowmin ? i : *rowmin;
*rowmax = i>*rowmax ? i : *rowmax;
break;
}
}
}
/* global col no.: mat_j[j]+hypre_ParCSRMatrixFirstColDiag(A)
or hypre_ParCSRMatrixColMapOffd(A)[ mat_j[j] ]
global row no.: i + hypre_ParCSRMatrixFirstRowIndex(A)
*/
}
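/* Illustration (not from the original source): if this processor's diag block
owns global columns 100..199, and global column 137 appears in local rows
3, 17 and 41 of diag (and nowhere in offd), then
RowsWithColumn_original( &rowmin, &rowmax, 137, A ) sets rowmin=3, rowmax=41.
Rows strictly between rowmin and rowmax need not contain the column; the
caller in hypre_MatTCommPkgCreate_core below conservatively treats every row
in [rowmin,rowmax] as a candidate to send. */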
void RowsWithColumn
( int * rowmin, int * rowmax, int column,
int num_rows_diag, int firstColDiag, int * colMapOffd,
int * mat_i_diag, int * mat_j_diag, int * mat_i_offd, int * mat_j_offd )
/* Finds the rows of A which have a nonzero in the given (global) column.
Sets *rowmin to the minimum (local) row number of such rows, and *rowmax
to the maximum. If there are no such rows, returns rowmax < 0 <= rowmin.
The matrix A, normally a hypre_ParCSRMatrix or hypre_ParCSRBooleanMatrix,
is specified by:
num_rows_diag (number of rows in diag, assumed to be the same in offd);
firstColDiag, colMapOffd (to map CSR-type matrix columns to ParCSR-type columns);
mat_i_diag, mat_j_diag: indices in the hypre_CSRMatrix or hypre_CSRBooleanMatrix
for the diag block of A;
mat_i_offd, mat_j_offd: indices in the hypre_CSRMatrix or hypre_CSRBooleanMatrix
for the offd block of A.
*/
{
int i, j;
*rowmin = num_rows_diag;
*rowmax = -1;
for ( i=0; i<num_rows_diag; ++i ) {
/* global number: row = i + firstRowIndex;*/
for ( j=mat_i_diag[i]; j<mat_i_diag[i+1]; ++j ) {
if ( mat_j_diag[j]+firstColDiag==column ) {
/* row i (local row number) has column mat_j[j] (local column number) */
*rowmin = i<*rowmin ? i : *rowmin;
*rowmax = i>*rowmax ? i : *rowmax;
break;
}
}
}
for ( i=0; i<num_rows_diag; ++i ) {
/* global number: row = i + firstRowIndex;*/
for ( j=mat_i_offd[i]; j<mat_i_offd[i+1]; ++j ) {
if ( colMapOffd[ mat_j_offd[j] ]==column ) {
/* row i (local row number) has column mat_j[j] (local column number) */
*rowmin = i<*rowmin ? i : *rowmin;
*rowmax = i>*rowmax ? i : *rowmax;
break;
}
}
}
/* global col no.: mat_j[j]+hypre_ParCSRMatrixFirstColDiag(A)
or hypre_ParCSRMatrixColMapOffd(A)[ mat_j[j] ]
global row no.: i + hypre_ParCSRMatrixFirstRowIndex(A)
*/
}
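/* Usage sketch (assumes A is a hypre_ParCSRMatrix and column is some global
column number; this is the same calling pattern hypre_MatTCommPkgCreate uses
when it hands these arrays to the core routine below):

hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A);
hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A);
int rowmin, rowmax;
RowsWithColumn( &rowmin, &rowmax, column,
hypre_CSRMatrixNumRows(diag),
hypre_ParCSRMatrixFirstColDiag(A), hypre_ParCSRMatrixColMapOffd(A),
hypre_CSRMatrixI(diag), hypre_CSRMatrixJ(diag),
hypre_CSRMatrixI(offd), hypre_CSRMatrixJ(offd) );
*/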
/* hypre_MatTCommPkgCreate_core does all the communication and computation for
hypre_MatTCommPkgCreate ( hypre_ParCSRMatrix *A )
and hypre_BoolMatTCommPkgCreate ( hypre_ParCSRBooleanMatrix *A ).
To support both data types, it uses hardly any data structures other than int*.
*/
void
hypre_MatTCommPkgCreate_core (
/* input args: */
MPI_Comm comm, int * col_map_offd, int first_col_diag, int * col_starts,
int num_rows_diag, int num_cols_diag, int num_cols_offd, int * row_starts,
int firstColDiag, int * colMapOffd,
int * mat_i_diag, int * mat_j_diag, int * mat_i_offd, int * mat_j_offd,
int data, /* = 1 for a matrix with floating-point data, =0 for Boolean matrix */
/* pointers to output args: */
int * p_num_recvs, int ** p_recv_procs, int ** p_recv_vec_starts,
int * p_num_sends, int ** p_send_procs, int ** p_send_map_starts,
int ** p_send_map_elmts
)
{
int num_sends;
int *send_procs;
int *send_map_starts;
int *send_map_elmts;
int num_recvs;
int *recv_procs;
int *recv_vec_starts;
int i, j, j2, k, ir, rowmin, rowmax;
int *tmp, *recv_buf, *displs, *info, *send_buf, *all_num_sends3;
int num_procs, my_id, num_elmts;
int local_info, index, index2;
int pmatch, col, kc, p;
int * recv_sz_buf;
int * row_marker;
MPI_Comm_size(comm, &num_procs);
MPI_Comm_rank(comm, &my_id);
info = hypre_CTAlloc(int, num_procs);
/* ----------------------------------------------------------------------
* determine which processors to receive from (set proc_mark) and num_recvs;
* at the end of the loop, proc_mark[i] contains the number of elements to be
* received from proc. i
*
* - For A*b or A*B: for each off-diagonal column i of A, you want to identify
* the processor which has the corresponding element i of b (row i of B);
* for columns in the local diagonal block you just multiply by local rows of B.
* You do this by finding the processor which has that column of A in its
* _diagonal_ block - assuming b or B is distributed the same way, which I
* believe is evenly among processors, by row. There is a unique solution
* because the diag/offd blocking is defined by which processor owns which
* rows of A.
*
* - For A*A^T: A^T is not distributed by rows as B or any 'normal' matrix is.
* For each off-diagonal element (k,i) of A, you want to identify the
* processors which have the corresponding elements (i,j) of A^T, i.e.,
* elements (j,i) of A (for all i,j,k for which these entries are nonzero,
* row k of A lives on this processor, and row j of A lives on a different
* processor). So, given a column i in the local A-offd or A-diag, we want to
* find all the processors which have column i, in their diag or offd blocks.
* Unlike the A*B case, I don't think you can avoid looking at any class of blocks.
* ---------------------------------------------------------------------*/
/* The algorithm for A*B was:
For each of my columns i (in the offd block), use known information on the data
distribution of columns in _diagonal_ blocks to find the processor p which
owns row i. (Note that for i in the diag block, I own the row; nothing to do.)
Count up such i's for each processor in proc_mark. Construct a data
structure, recv_buf, made by appending a structure tmp from each processor.
The data structure tmp looks like (p, no. of i's, i1, i2,...) (p=0,...) .
There are two communication steps: gather size information (local_info) from
all processors (into info), then gather the data (tmp) from all processors
(into recv_buf). Then you go through recv_buf. For each (sink) processor p
you search for the appearance of my (source) processor number
(most of recv_buf pertains to other processors and is ignored).
When you find the appropriate section, pull out the i's, count them, save
them in send_map_elmts, save p in send_procs, and save the index information
in send_map_starts.
*/
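/* For example (hypothetical numbers), if this processor needed columns 7 and
12 owned by proc 0 and column 31 owned by proc 2, the A*B-style tmp might
look like { 0, 2, 7, 12,   2, 1, 31 }, i.e. (p, count, columns...) for each
processor p from which something is needed. */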
/* The algorithm for A*A^T:
[ Originally I had planned to figure out approximately which processors
had the information (for A*B it could be done exactly) to save on
communication. But even for A*B, where the data owner is known, all data is
sent to all processors, so that's not worth worrying about on the first cut.
One consequence is that proc_mark is not needed. ]
Construct a data structure, recv_buf, made by appending a structure tmp for
each processor. It simply consists of (no. of i's, i1, i2,...) where i is
the global number of a column in the offd or diag block. There are still two
communication steps: gather size information (local_info) from all processors
(into info), then gather the data (tmp) from all processors (into recv_buf).
Then you go through recv_buf. For each (sink) processor p you go through
all its column numbers in recv_buf. Check each one for whether you have
data in that column. If so, put it in send_map_elmts, put p in send_procs,
and update the index information in send_map_starts. Note that these
arrays don't mean quite the same thing as for A*B.
*/
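/* For example (hypothetical numbers), with 3 processors,
col_starts = {0,10,13,20}, my_id = 1 (so this processor owns global columns
10..12 in its diag block) and offd columns {7,15}, the tmp built below is
{ 1, 7,   3, 10, 11, 12,   1, 15 }
i.e. for each proc p: a count, then the global columns of this processor
(offd first, then diag) that lie in [col_starts[p], col_starts[p+1]).
Its length is num_procs + num_cols_offd + num_cols_diag = local_info. */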
num_recvs=num_procs-1;
local_info = num_procs + num_cols_offd + num_cols_diag;
MPI_Allgather(&local_info, 1, MPI_INT, info, 1, MPI_INT, comm);
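/* info[p] is now the size of proc p's tmp block (built below): num_procs
counts plus one entry for each of proc p's offd and diag columns */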
/* ----------------------------------------------------------------------
* generate information to be sent: tmp contains, for each recv_proc:
* {deleted: id of recv_procs}, the number of elements to be received for this
* processor, and the indices of those elements (in this order)
* ---------------------------------------------------------------------*/
displs = hypre_CTAlloc(int, num_procs+1);
displs[0] = 0;
for (i=1; i < num_procs+1; i++)
displs[i] = displs[i-1]+info[i-1];
recv_buf = hypre_CTAlloc(int, displs[num_procs]);
tmp = hypre_CTAlloc(int, local_info);
j = 0;
for (i=0; i < num_procs; i++) {
j2 = j++;
tmp[j2] = 0;
for (k=0; k < num_cols_offd; k++)
if (col_map_offd[k] >= col_starts[i] &&
col_map_offd[k] < col_starts[i+1]) {
tmp[j++] = col_map_offd[k];
++(tmp[j2]);
};
for (k=0; k < num_cols_diag; k++)
if ( k+first_col_diag >= col_starts[i] &&
k+first_col_diag < col_starts[i+1] ) {
tmp[j++] = k + first_col_diag;
++(tmp[j2]);
}
}
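/* Gather everyone's tmp into recv_buf; proc p's block starts at displs[p]
and has length info[p]. */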
MPI_Allgatherv(tmp,local_info,MPI_INT,recv_buf,info,displs,MPI_INT,comm);
/* ----------------------------------------------------------------------
* determine send_procs and the actual elements to be sent (in send_map_elmts)
* and send_map_starts, whose i-th entry points to the beginning of the
* elements to be sent to proc. i
* ---------------------------------------------------------------------*/
/* Meanings of the arrays being set here, more verbosely stated:
send_procs: processors p to send to
send_map_starts: for each p, gives the range of indices in send_map_elmts;
send_map_elmts: each element is a send_map_elmts[i], with i in a range given
by send_map_starts[p..p+1] for some p. This element is the global
column number of a column in the offd block of p which is to be multiplied
by data from this processor.
For A*B, send_map_elmts[i] is therefore a row of B belonging to this
processor, to be sent to p. For A*A^T, send_map_elmts[i] is a row of A
belonging to this processor, to be sent to p; this row was selected
because it has a nonzero in a _column_ needed by p.
*/
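/* Illustration (hypothetical numbers): if this processor ends up sending
local rows {2,3,4} to proc 0 and local rows {7,9} to proc 3, then after the
loop below num_sends=2, send_procs={0,3}, send_map_starts={0,3,5} and
send_map_elmts={2,3,4,7,9} (local row numbers, as stored by the loop). */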
num_sends = num_procs; /* may turn out to be less, but we can't know yet */
num_elmts = (num_procs-1)*num_rows_diag;
/* ... a crude upper bound; should try to do better even if more comm required */
send_procs = hypre_CTAlloc(int, num_sends);
send_map_starts = hypre_CTAlloc(int, num_sends+1);
send_map_elmts = hypre_CTAlloc(int, num_elmts);
row_marker = hypre_CTAlloc(int,num_rows_diag);
index = 0;
index2 = 0;
send_map_starts[0] = 0;
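/* Main loop: for each candidate destination proc i (i != my_id), scan i's
block of recv_buf, i.e. the global columns i needs. If such a column is one
of this processor's offd columns, or lies in this processor's own row range
[row_starts[my_id], row_starts[my_id+1]), then every local row in the
[rowmin,rowmax] span returned by RowsWithColumn is scheduled to be sent to i;
row_marker keeps a row from being listed twice for the same i. */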
for (i=0; i < num_procs; i++) {
send_map_starts[index+1] = send_map_starts[index];
j = displs[i];
pmatch = 0;
for ( ir=0; ir<num_rows_diag; ++ir ) row_marker[ir] = 0;
while ( j < displs[i+1])
{
num_elmts = recv_buf[j++]; /* no. of columns proc. i wants */
for ( k=0; k<num_elmts; k++ ) {
col = recv_buf[j++]; /* a global column no. at proc. i */
for ( kc=0; kc<num_cols_offd; kc++ ) {
if ( col_map_offd[kc]==col && i!=my_id ) {
/* this processor has the same column as proc. i (but is different) */
pmatch = 1;
send_procs[index] = i;
/* this would be right if we could send columns, but we can't ...
offset = first_col_diag;
++send_map_starts[index+1];
send_map_elmts[index2++] = col - offset; */
/* Plan to send all of my rows which use this column... */
RowsWithColumn( &rowmin, &rowmax, col,
num_rows_diag,
firstColDiag, colMapOffd,
mat_i_diag, mat_j_diag, mat_i_offd, mat_j_offd
);
for ( ir=rowmin; ir<=rowmax; ++ir ) {
if ( row_marker[ir]==0 ) {
row_marker[ir] = 1;
++send_map_starts[index+1];
send_map_elmts[index2++] = ir;
}
}
}
}
/* alternative way of doing the following for-loop:
for ( kc=0; kc<num_cols_diag; kc++ ) {
if ( kc+first_col_diag==col && i!=my_id ) {
/ * this processor has the same column as proc. i (but is different) * /
pmatch = 1;
/ * this would be right if we could send columns, but we can't ... >>> * /
send_procs[index] = i;
++send_map_starts[index+1];
send_map_elmts[index2++] = col - offset;
/ * Plan to send all of my rows which use this column... * /
/ * NOT DONE * /
}
}
*/
for ( kc=row_starts[my_id]; kc<row_starts[my_id+1]; kc++ ) {
if ( kc==col && i!=my_id ) {
/* this processor has the same column as proc. i (but is different) */
pmatch = 1;
send_procs[index] = i;
/* this would be right if we could send columns, but we can't ... >>>
++send_map_starts[index+1];
send_map_elmts[index2++] = col - offset;*/
/* Plan to send all of my rows which use this column... */
RowsWithColumn( &rowmin, &rowmax, col,
num_rows_diag,
firstColDiag, colMapOffd,
mat_i_diag, mat_j_diag, mat_i_offd, mat_j_offd
);
for ( ir=rowmin; ir<=rowmax; ++ir ) {
if ( row_marker[ir]==0 ) {
row_marker[ir] = 1;
++send_map_starts[index+1];
send_map_elmts[index2++] = ir;
}
}
}
}
}
}
if ( pmatch ) index++;
}
num_sends = index; /* no. of procs. that rows will be sent to */
/* Compute receive arrays recv_procs, recv_vec_starts ... */
recv_procs = hypre_CTAlloc(int, num_recvs);
recv_vec_starts = hypre_CTAlloc(int, num_recvs+1);
j2 = 0;
for (i=0; i < num_procs; i++) {
if ( i!=my_id ) { recv_procs[j2] = i; j2++; };
};
/* Compute recv_vec_starts.
The real job is, for each processor p, to figure out how many rows
p will send to me (me = this processor). I already know how many (and which)
rows I will send to each p: if send_procs[index]=p, then the
number is send_map_starts[index+1]-send_map_starts[index].
More communication is needed.
Options:
MPI_Allgather of communication sizes. <--- my choice, for now
good: simple; bad: sends num_procs*num_sends data when only num_procs is needed;
but: not that much data compared to the previous communication
MPI_Allgatherv of communication sizes, only for pairs of procs. that communicate
good: less data than above; bad: needs an extra communication step to get recvcounts
MPI_Isend/MPI_Irecv of each size, separately between each pair of procs.
good: no excess data sent; bad: lots of little messages;
but: Allgather might be done the same way under the hood, and this may be
much slower than Allgather or a bit faster, depending on the implementation
*/
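/* The exchange below: each processor contributes num_sends triples
(destination proc, my_id, number of rows) via MPI_Allgatherv; afterwards each
processor scans all the triples and keeps those whose destination is itself,
which yields recv_procs and recv_vec_starts. */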
send_buf = hypre_CTAlloc( int, 3*num_sends );
all_num_sends3 = hypre_CTAlloc( int, num_procs );
/* scatter-gather num_sends, to set up the size for the main comm. step */
i = 3*num_sends;
MPI_Allgather( &i, 1, MPI_INT, all_num_sends3, 1, MPI_INT, comm );
displs[0] = 0;
for ( p=0; p<num_procs; ++p ) {
displs[p+1] = displs[p] + all_num_sends3[p];
};
recv_sz_buf = hypre_CTAlloc( int, displs[num_procs] );
/* scatter-gather size of row info to send, and proc. to send to */
index = 0;
for ( i=0; i<num_sends; ++i ) {
send_buf[index++] = send_procs[i]; /* processor to send to */
send_buf[index++] = my_id;
send_buf[index++] = send_map_starts[i+1] - send_map_starts[i];
/* ... sizes of info to send */
};
MPI_Allgatherv( send_buf, 3*num_sends, MPI_INT,
recv_sz_buf, all_num_sends3, displs, MPI_INT, comm);
recv_vec_starts[0] = 0;
j2 = 0; j = 0;
for ( i=0; i<displs[num_procs]; i=i+3 ) {
j = i;
if ( recv_sz_buf[j++]==my_id ) {
recv_procs[j2] = recv_sz_buf[j++];
recv_vec_starts[j2+1] = recv_vec_starts[j2] + recv_sz_buf[j++];
j2++;
}
}
num_recvs = j2;
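/* num_recvs may now be smaller than num_procs-1: only processors that
actually send rows to this one were recorded above; the pre-filled tail of
recv_procs is simply ignored. */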
#if 0
printf("num_procs=%i send_map_starts (%i):",num_procs,num_sends+1);
for( i=0; i<=num_sends; ++i ) printf(" %i", send_map_starts[i] );
printf(" send_procs (%i):",num_sends);
for( i=0; i<num_sends; ++i ) printf(" %i", send_procs[i] );
printf("\n");
printf("my_id=%i num_sends=%i send_buf[0,1,2]=%i %i %i",
my_id, num_sends, send_buf[0], send_buf[1], send_buf[2] );
printf(" all_num_sends3[0,1]=%i %i\n", all_num_sends3[0], all_num_sends3[1] );
printf("my_id=%i rcv_sz_buf (%i):", my_id, displs[num_procs] );
for( i=0; i<displs[num_procs]; ++i ) printf(" %i", recv_sz_buf[i] );
printf("\n");
printf("my_id=%i recv_vec_starts (%i):",my_id,num_recvs+1);
for( i=0; i<=num_recvs; ++i ) printf(" %i", recv_vec_starts[i] );
printf(" recv_procs (%i):",num_recvs);
for( i=0; i<num_recvs; ++i ) printf(" %i", recv_procs[i] );
printf("\n");
printf("my_id=%i num_recvs=%i recv_sz_buf[0,1,2]=%i %i %i\n",
my_id, num_recvs, recv_sz_buf[0], recv_sz_buf[1], recv_sz_buf[2] );
#endif
hypre_TFree(send_buf);
hypre_TFree(all_num_sends3);
hypre_TFree(tmp);
hypre_TFree(recv_buf);
hypre_TFree(displs);
hypre_TFree(info);
hypre_TFree(recv_sz_buf);
hypre_TFree(row_marker);
/* finish up with the hand-coded call-by-reference... */
*p_num_recvs = num_recvs;
*p_recv_procs = recv_procs;
*p_recv_vec_starts = recv_vec_starts;
*p_num_sends = num_sends;
*p_send_procs = send_procs;
*p_send_map_starts = send_map_starts;
*p_send_map_elmts = send_map_elmts;
}
/* ----------------------------------------------------------------------
* hypre_MatTCommPkgCreate
* generates a special comm_pkg for A - for use in multiplying by its
* transpose, A * A^T
* if no row and/or column partitioning is given, the routine determines
* them with MPE_Decomp1d
* ---------------------------------------------------------------------*/
int
hypre_MatTCommPkgCreate ( hypre_ParCSRMatrix *A)
{
hypre_ParCSRCommPkg *comm_pkg;
MPI_Comm comm = hypre_ParCSRMatrixComm(A);
/* MPI_Datatype *recv_mpi_types;
MPI_Datatype *send_mpi_types;
*/
int num_sends;
int *send_procs;
int *send_map_starts;
int *send_map_elmts;
int num_recvs;
int *recv_procs;
int *recv_vec_starts;
int *col_map_offd = hypre_ParCSRMatrixColMapOffd(A);
int first_col_diag = hypre_ParCSRMatrixFirstColDiag(A);
int *col_starts = hypre_ParCSRMatrixColStarts(A);
int ierr = 0;
int num_rows_diag = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
int num_cols_diag = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixDiag(A));
int num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A));
int * row_starts = hypre_ParCSRMatrixRowStarts(A);
hypre_MatTCommPkgCreate_core (
comm, col_map_offd, first_col_diag, col_starts,
num_rows_diag, num_cols_diag, num_cols_offd, row_starts,
hypre_ParCSRMatrixFirstColDiag(A),
hypre_ParCSRMatrixColMapOffd(A),
hypre_CSRMatrixI( hypre_ParCSRMatrixDiag(A) ),
hypre_CSRMatrixJ( hypre_ParCSRMatrixDiag(A) ),
hypre_CSRMatrixI( hypre_ParCSRMatrixOffd(A) ),
hypre_CSRMatrixJ( hypre_ParCSRMatrixOffd(A) ),
1,
&num_recvs, &recv_procs, &recv_vec_starts,
&num_sends, &send_procs, &send_map_starts,
&send_map_elmts
);
comm_pkg = hypre_CTAlloc(hypre_ParCSRCommPkg, 1);
hypre_ParCSRCommPkgComm(comm_pkg) = comm;
hypre_ParCSRCommPkgNumRecvs(comm_pkg) = num_recvs;
hypre_ParCSRCommPkgRecvProcs(comm_pkg) = recv_procs;
hypre_ParCSRCommPkgRecvVecStarts(comm_pkg) = recv_vec_starts;
hypre_ParCSRCommPkgNumSends(comm_pkg) = num_sends;
hypre_ParCSRCommPkgSendProcs(comm_pkg) = send_procs;
hypre_ParCSRCommPkgSendMapStarts(comm_pkg) = send_map_starts;
hypre_ParCSRCommPkgSendMapElmts(comm_pkg) = send_map_elmts;
hypre_ParCSRMatrixCommPkgT(A) = comm_pkg;
return ierr;
}
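/* Usage sketch (illustrative, not part of the library): given a fully
constructed hypre_ParCSRMatrix *A,

hypre_MatTCommPkgCreate( A );
comm_pkg_T = hypre_ParCSRMatrixCommPkgT( A );

comm_pkg_T (a hypre_ParCSRCommPkg*) then describes the communication pattern
needed when multiplying with the transpose of A (A * A^T); the package is
stored in, and owned by, A. */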