hypre/distributed_linear_solvers/pilut/parilut.c

1271 lines
39 KiB
C

/*
* parilut.c
*
* This file implements the parallel phase of the ILUT algorithm
*
* Started 10/21/95
* George
*
* Taken over by MRGates 7/1/97.
*
* 7/8
* - added rowlen to rmat and verified
* 7/11
* - MPI and validated
* - fixed one more problem with rowlen (a rcolind--)
* 7/25
* - replaced George's reduction and second drop functions with my own
* The biggest difference is since I allow non-diagonal MIS sets then
* there is fill into L and the L must be processed in the correct
* order. Therefore I reverted to using the workspace as in serilut.
* (Note that this changes our answer so it is hard to verify.)
* - seperated the second drop function into four stages:
* 1) drop below rtol
* 2) seperate LU entries
* 3) update L for the row
* 4) form nrmat or DU for the row
* 7/28
* - finished the local factorization to reduce non-diagonal sets.
* This allows fillin, but the remote reduction still does not since
* all the necesary rows are not recieved yet (and thus it doesn't
* know what rows are actually in the MIS--otherwise we could just
* ignore MIS fillin for the moment).
* 7/29
* - send all factored rows, not just the requested ones (changes EraseMap also)
* - add the (maxnz+2) factor to the map, so for outside nodes, l is the exact index
* - removed inrval, instead using the new permutation to know what rows are MIS
* - removed map from the cinfo, since it was never refered to but globally
* 8/1
* - implemented split PE numbering. change VPE(x) to (x) to get back unsplit numbering.
* 8/6
* - Removed split PE numbering. After further testing, this does not seem to be an
* improvement, since it increases the number of levels. See par_split.c for that code.
*/
#include "./DistributedMatrixPilutSolver.h"
#include "ilu.h"
/*************************************************************************
* This function performs ILUT on the boundary nodes via MIS computation
**************************************************************************/
void ParILUT(DataDistType *ddist, FactorMatType *ldu,
ReduceMatType *rmat, int gmaxnz, double tol,
hypre_PilutSolverGlobals *globals )
{
int nmis, i, j, k, l, nlevel, nbnd;
CommInfoType cinfo;
int *perm, *iperm, *newiperm, *newperm;
ReduceMatType *rmats[2], nrmat;
#ifdef HYPRE_DEBUG
PrintLine("ILUT start", globals);
#endif
/* Initialize globals */
global_maxnz = gmaxnz;
nrows = ddist->ddist_nrows;
lnrows = ddist->ddist_lnrows;
firstrow = ddist->ddist_rowdist[mype];
lastrow = ddist->ddist_rowdist[mype+1];
perm = ldu->perm;
iperm = ldu->iperm;
ndone = rmat->rmat_ndone;
nbnd = ntogo = rmat->rmat_ntogo;
nleft = GlobalSESum(ntogo, pilut_comm);
rmats[0] = rmat;
rmats[1] = &nrmat;
/* Initialize and allocate structures, including global workspace */
ParINIT( &nrmat, &cinfo, ddist->ddist_rowdist, globals );
/* Copy the old perm into new perm vectors at the begining.
* After that this is done more or less automatically */
newperm = idx_malloc(lnrows, "ParILUT: newperm");
newiperm = idx_malloc(lnrows, "ParILUT: newiperm");
memcpy_idx(newperm, perm, lnrows);
memcpy_idx(newiperm, iperm, lnrows);
ldu->nnodes[0] = ndone;
nlevel = 0;
while( nleft > 0 ) {
/* printf("PE %d Nlevel: %d, Nleft: %d, (%d,%d)\n",
* mype, nlevel, nleft, ndone, ntogo); fflush(0); */
ComputeCommInfo(rmats[nlevel%2], &cinfo, ddist->ddist_rowdist, globals );
nmis = SelectSet(rmats[nlevel%2], &cinfo, perm, iperm, newperm, newiperm, globals );
FactorLocal(ldu, rmats[nlevel%2], rmats[(nlevel+1)%2], &cinfo,
perm, iperm, newperm, newiperm, nmis, tol, globals );
fflush(0); MPI_Barrier(pilut_comm);
SendFactoredRows(ldu, &cinfo, newperm, nmis, globals);
fflush(0); MPI_Barrier(pilut_comm);
ComputeRmat(ldu, rmats[nlevel%2], rmats[(nlevel+1)%2], &cinfo,
perm, iperm, newperm, newiperm, nmis, tol, globals);
EraseMap(&cinfo, newperm, nmis, globals);
/* copy the new portion of the permutation, and the entire inverse
* (since updates to the inverse are scattered throughout.) */
memcpy_idx(perm+ndone, newperm+ndone, ntogo );
memcpy_idx(iperm, newiperm, lnrows);
/* setup next rmat */
nlevel++;
ndone = rmats[nlevel%2]->rmat_ndone = ndone+nmis;
ntogo = rmats[nlevel%2]->rmat_ntogo = ntogo-nmis;
nleft = GlobalSESum(ntogo, pilut_comm);
if (nlevel > MAXNLEVEL)
errexit("Maximum number of levels exceeded!\n", globals);
ldu->nnodes[nlevel] = ndone;
}
ldu->nlevels = nlevel;
free_multi(jr, jw, lr, w, map,
nrmat.rmat_rnz, nrmat.rmat_rrowlen, nrmat.rmat_rcolind,
nrmat.rmat_rvalues,
cinfo.gatherbuf, cinfo.rrowind, cinfo.rnbrind, cinfo.rnbrptr,
cinfo.snbrind, cinfo.srowind, cinfo.snbrptr,
cinfo.incolind, cinfo.invalues,
newperm, newiperm, vrowdist, -1);
#ifdef HYPRE_DEBUG
PrintLine("ParILUT done", globals);
#endif
}
/*************************************************************************
* This function determines communication info. It assumes (and leaves)
* the map in a zero state. If memory requirements increase, it will
* free and reallocate memory for send/recieve buffers. Usually memory
* doesn't increase since the problem size is decreasing each iteration.
*
* The rrowind and srowind now have two bits packed into them, so
* (rowind>>2) is the index, rowind & 0x1 is lo, rowind & 0x2 is hi,
* where lo==1 means the lower half has this nonzero col index, hi==1 means
* the upper half has this nonzero col index.
**************************************************************************/
void ComputeCommInfo(ReduceMatType *rmat, CommInfoType *cinfo, int *rowdist,
hypre_PilutSolverGlobals *globals)
{
int i, ir, j, k, midpt, penum;
int nrecv, nsend, rnnbr, snnbr, maxnrecv, maxnsend, maxnrand;
int *rnz, *rcolind;
int *rrowind, *rnbrptr, *rnbrind, *srowind, *snbrind, *snbrptr;
MPI_Status Status ;
MPI_Request *index_requests;
#ifdef HYPRE_DEBUG
PrintLine("ComputeCommInfo", globals);
#endif
rnz = rmat->rmat_rnz;
rnbrind = cinfo->rnbrind;
rnbrptr = cinfo->rnbrptr;
rrowind = cinfo->rrowind;
snbrind = cinfo->snbrind;
snbrptr = cinfo->snbrptr;
/* Determine the indices that are needed */
nrecv = 0;
midpt = (firstrow + lastrow)/2;
for (ir=0; ir<ntogo; ir++) {
rcolind = rmat->rmat_rcolind[ir];
for (j=1; j<rnz[ir]; j++) {
k = rcolind[j];
CheckBounds(0, k, nrows, globals);
if ((k < firstrow || k >= lastrow) && map[k] == 0) {
map[k] = 1;
rrowind[nrecv++] = k;
}
}
}
/* Sort the indices to be received in increasing order */
sincsort_fast(nrecv, rrowind);
/* Determine processor boundaries in the rowind */
rnnbr = 0;
rnbrptr[0] = 0;
for (penum=0, j=0; penum<npes && j<nrecv; penum++) {
k = j;
for (; j<nrecv; j++) {
if (rrowind[j] >= rowdist[penum+1])
break;
}
if (j-k > 0) { /* Something for pe penum */
rnbrind[rnnbr] = penum;
rnbrptr[++rnnbr] = j;
}
}
cinfo->rnnbr = rnnbr;
/* reset the map afterwards */
for (i=0; i<nrecv; i++)
map[rrowind[i]] = 0;
/* Now you know from which processors, and what you need. */
cinfo->maxntogo = GlobalSEMax(ntogo, pilut_comm);
maxnrecv = rnnbr*(cinfo->maxntogo); /*GlobalSEMax(nrecv);*/
/* If memory requirements change, allocate new memory.
* The first iteration this always occurs -- see ParINIT */
if (cinfo->maxnrecv < maxnrecv) {
if (cinfo->incolind) free(cinfo->incolind);
if (cinfo->invalues) free(cinfo->invalues);
cinfo->incolind = idx_malloc(maxnrecv*(global_maxnz+2), "ComputeCommInfo: cinfo->incolind");
cinfo->invalues = fp_malloc(maxnrecv*(global_maxnz+2), "ComputeCommInfo: cinfo->invalues");
cinfo->maxnrecv = maxnrecv;
}
assert( cinfo->incolind != NULL );
assert( cinfo->invalues != NULL );
/* Zero our send buffer */
for(i=0; i<npes; i++)
pilu_send[i] = 0;
/* tell the processors in nbrind what I'm going to send them. */
for (i=0; i<rnnbr; i++)
pilu_send[rnbrind[i]] = rnbrptr[i+1]-rnbrptr[i]; /* The # of rows I need */
MPI_Alltoall( pilu_send, 1, MPI_INT,
pilu_recv, 1, MPI_INT, pilut_comm );
nsend = 0;
snnbr = 0;
snbrptr[0] = 0;
for (penum=0; penum<npes; penum++) {
if (pilu_recv[penum] > 0) {
nsend += pilu_recv[penum];
snbrind[snnbr] = penum;
snbrptr[++snnbr] = nsend;
}
}
cinfo->snnbr = snnbr;
/* Allocate requests */
index_requests = hypre_CTAlloc( MPI_Request, rnnbr );
maxnsend = GlobalSEMax(nsend, pilut_comm);
/* If memory requirements change, allocate new memory.
* The first iteration this always occurs -- see ParINIT */
if (cinfo->maxnsend < maxnsend) {
if(cinfo->srowind) free(cinfo->srowind);
cinfo->srowind = idx_malloc(maxnsend, "ComputeCommInfo: cinfo->srowind");
cinfo->maxnsend = maxnsend;
}
assert( cinfo->srowind != NULL );
srowind = cinfo->srowind;
/* issue asynchronous recieves */
for (i=0; i<snnbr; i++) {
MPI_Irecv( srowind+snbrptr[i], snbrptr[i+1]-snbrptr[i], MPI_INT,
snbrind[i], TAG_Comm_rrowind, pilut_comm, &index_requests[i] ) ;
}
/* OK, now I go and send the rrowind to the processor */
for (i=0; i<rnnbr; i++) {
MPI_Send( rrowind+rnbrptr[i], rnbrptr[i+1]-rnbrptr[i], MPI_INT,
rnbrind[i], TAG_Comm_rrowind, pilut_comm );
}
/* finalize recieves */
for (i=0; i<snnbr; i++) {
MPI_Wait( &index_requests[i], &Status ) ;
}
/* clean up memory */
hypre_TFree(index_requests);
}
/*************************************************************************
* This function returns what virtual PE the given row idx is located on.
**************************************************************************/
int Idx2PE(int idx,
hypre_PilutSolverGlobals *globals)
{
int penum = 0;
while (idx >= vrowdist[penum+1]) { /* idx >= lastrow? */
penum++;
assert( penum < npes );
}
return penum;
}
/*************************************************************************
* This function computes a set that is independant between PEs but may
* contain dependencies within a PE. This variant simply gives rows to
* the lowest PE possible, which creates some load imbalancing between
* the highest and lowest PEs. It also forms the new permutation and
* marks the _local_ rows that are in the set (but not remote rows).
* For historical reasons the set is called a maximal indep. set (MIS).
**************************************************************************/
int SelectSet(ReduceMatType *rmat, CommInfoType *cinfo,
int *perm, int *iperm,
int *newperm, int *newiperm,
hypre_PilutSolverGlobals *globals)
{
int ir, i, j, k, l, num, midpt, pe;
int nnz, snnbr;
int *rcolind, *snbrind, *snbrptr, *srowind;
#ifdef HYPRE_DEBUG
PrintLine("SelectSet", globals);
#endif
snnbr = cinfo->snnbr;
snbrind = cinfo->snbrind;
snbrptr = cinfo->snbrptr;
srowind = cinfo->srowind;
/* determine local rows that do not have non-zeros on higher numbered PEs. */
num = 0;
midpt = (firstrow+lastrow)/2;
for (ir=0; ir<ntogo; ir++) {
i = perm[ir+ndone]+firstrow;
rcolind = rmat->rmat_rcolind[ir];
nnz = rmat->rmat_rnz[ir];
for (j=1; j<nnz; j++) {
if ((rcolind[j] < firstrow || rcolind[j] >= lastrow) &&
mype > Idx2PE(rcolind[j], globals))
break ;
}
if ( j == nnz ) { /* passed test; put into set */
jw[num++] = i;
map[i] = 1; /* local doesn't need info in high bits */
}
}
/* check for asymetries -- the triangular solves depend on the set being block diagonal */
for (k=0; k<snnbr; k++)
if (snbrind[k] < mype)
for (i=snbrptr[k]; i<snbrptr[k+1]; i++)
for (j=0; j<num; j++)
if (srowind[i] == jw[j]) {
CheckBounds(firstrow, jw[j], lastrow, globals);
map[jw[j]] = 0;
jw[j] = jw[--num];
}
/* Compute the new permutation with MIS at beginning */
j = ndone;
k = ndone+num;
for (ir=ndone; ir<lnrows; ir++) {
l = perm[ir];
CheckBounds(0, l, lnrows, globals);
if (map[l+firstrow] == 1) { /* This is in MIS, put it into ldu */
CheckBounds(ndone, j, ndone+num, globals);
newperm[j] = l;
newiperm[l] = j++;
}
else {
CheckBounds(ndone+num, k, lnrows, globals);
newperm[k] = l;
newiperm[l] = k++;
}
}
#ifndef NDEBUG
/* DEBUGGING: check map is zero outside of local rows */
for (i=0; i<firstrow; i++)
assert(map[i] == 0);
for (i=lastrow; i<nrows; i++)
assert(map[i] == 0);
#endif
return num;
}
/*************************************************************************
* This function sends the factored rows to the appropriate processors. The
* rows are sent in the order of the _new_ MIS permutation. Each PE then
* uses the recieved information to mark _remote_ rows in the MIS. It takes
* as input the factored rows in LDU, the new permutation vectors, and the
* global map with local MIS rows already marked. This also updates the
* rnbrptr[i] to be the actual number of rows recieved from PE rnbrind[i].
* 3/20/98: Bug fix, lengths input to sgatherbuf increased by one to reflect
* fact that diagonal element is also transmitted. -AJC
**************************************************************************/
void SendFactoredRows(FactorMatType *ldu, CommInfoType *cinfo,
int *newperm, int nmis, hypre_PilutSolverGlobals *globals)
{
int i, j, k, ku, kg, l, penum, snnbr, rnnbr, cnt, inCnt;
int *snbrind, *rnbrind, *rnbrptr, *sgatherbuf, *incolind;
int *usrowptr, *uerowptr, *ucolind;
double *dgatherbuf, *uvalues, *dvalues, *invalues;
MPI_Status Status;
MPI_Request *index_requests, *value_requests ;
#ifdef HYPRE_DEBUG
PrintLine("SendFactoredRows", globals);
#endif
snnbr = cinfo->snnbr;
snbrind = cinfo->snbrind;
rnnbr = cinfo->rnnbr;
rnbrind = cinfo->rnbrind;
rnbrptr = cinfo->rnbrptr;
/* NOTE we cast a (double*) to an (int*) */
sgatherbuf = (int *)cinfo->gatherbuf;
dgatherbuf = cinfo->gatherbuf;
incolind = cinfo->incolind;
invalues = cinfo->invalues;
usrowptr = ldu->usrowptr;
uerowptr = ldu->uerowptr;
ucolind = ldu->ucolind;
uvalues = ldu->uvalues;
dvalues = ldu->dvalues;
/* Allocate requests */
index_requests = hypre_CTAlloc( MPI_Request, rnnbr );
value_requests = hypre_CTAlloc( MPI_Request, rnnbr );
/* Issue asynchronous receives for rows from other processors.
Asynchronous receives needed to avoid overflowing comm buffers. */
j = 0;
cnt = (cinfo->maxntogo)*(global_maxnz+2) ;
for (i=0; i<rnnbr; i++) {
penum = rnbrind[i];
MPI_Irecv( incolind+j, cnt, MPI_INT,
penum, TAG_Send_colind, pilut_comm, &index_requests[i] );
MPI_Irecv( invalues+j, cnt, MPI_DOUBLE,
penum, TAG_Send_values, pilut_comm, &value_requests[i] );
j += cnt;
}
/* pack the colind for sending*/
l = 0;
for (j=ndone; j<ndone+nmis; j++) {
k = newperm[j];
CheckBounds(firstrow, k+firstrow, lastrow, globals);
assert(IsInMIS(map[k+firstrow]));
CheckBounds(0, uerowptr[k]-usrowptr[k], global_maxnz+1, globals);
/* sgatherbuf[l++] = uerowptr[k]-usrowptr[k]; */ /* store length */
/* Bug fix, 3/20/98 */
sgatherbuf[l++] = uerowptr[k]-usrowptr[k]+1; /* store length */
sgatherbuf[l++] = k+firstrow; /* store row # */
for (ku=usrowptr[k], kg=l; ku<uerowptr[k]; ku++, kg++)
sgatherbuf[kg] = ucolind[ku];
l += global_maxnz;
}
/* send colind to each neighbor */
for (i=0; i<snnbr; i++) {
MPI_Send( sgatherbuf, l, MPI_INT,
snbrind[i], TAG_Send_colind, pilut_comm );
}
/* pack the values */
l = 0;
for (j=ndone; j<ndone+nmis; j++) {
k = newperm[j];
CheckBounds(firstrow, k+firstrow, lastrow, globals);
assert(IsInMIS(map[k+firstrow]));
l++; /* first element undefined */
dgatherbuf[l++] = dvalues[k]; /* store diagonal */
for (ku=usrowptr[k], kg=l; ku<uerowptr[k]; ku++, kg++)
dgatherbuf[kg] = uvalues[ku];
l += global_maxnz;
}
/* send values to each neighbor */
for (i=0; i<snnbr; i++) {
MPI_Send( dgatherbuf, l, MPI_DOUBLE,
snbrind[i], TAG_Send_values, pilut_comm );
}
/* Finish receiving rows */
j = 0;
cnt = (cinfo->maxntogo)*(global_maxnz+2) ;
for (i=0; i<rnnbr; i++) {
penum = rnbrind[i];
MPI_Wait( &index_requests[i], &Status);
/* save where each row is recieved into the map */
MPI_Get_count( &Status, MPI_INT, &inCnt );
rnbrptr[i] = inCnt;
for (k=0; k<inCnt; k += global_maxnz+2)
map[incolind[j+k+1]] = ((j+k)<<1) + 1; /* pack MIS flag in LSB */
MPI_Wait( &value_requests[i], &Status);
j += cnt;
CheckBounds(0, j, (cinfo->maxnrecv)*(global_maxnz+2)+2, globals);
}
/* clean up memory */
hypre_TFree(index_requests);
hypre_TFree(value_requests);
}
/*************************************************************************
* This function creates the new reduce matrix. It takes as input the
* current reduced matrix and the outside nodes sent from other PEs.
* Also both the old permutation (which applies to this rmat) and the new
* permutation (which applies to the new rmat) are taken as input. After
* each row is computed, the number of non-zeros is kept the same.
*
* Note that all fill elements into the L portion mus fill unto the same
* processor as the row being subtracted is, since it is block diagonal.
**************************************************************************/
void ComputeRmat(FactorMatType *ldu, ReduceMatType *rmat,
ReduceMatType *nrmat, CommInfoType *cinfo,
int *perm, int *iperm,
int *newperm, int *newiperm, int nmis, double tol,
hypre_PilutSolverGlobals *globals)
{
int i, ir, inr, start, j, k, kk, l, m, end, nnz;
int *usrowptr, *uerowptr, *ucolind, *incolind, *rcolind, rrowlen;
double *uvalues, *nrm2s, *invalues, *rvalues, *dvalues;
double mult, nval, rtol;
#ifdef HYPRE_DEBUG
PrintLine("ComputeRmat", globals);
#endif
usrowptr = ldu->usrowptr;
uerowptr = ldu->uerowptr;
ucolind = ldu->ucolind;
uvalues = ldu->uvalues;
dvalues = ldu->dvalues;
nrm2s = ldu->nrm2s;
incolind = cinfo->incolind;
invalues = cinfo->invalues;
/* OK, now reduce the remaining rows.
* inr counts the rows actually factored as an index for the nrmat */
inr = 0;
for (ir=ndone+nmis; ir<lnrows; ir++) {
i = newperm[ir];
CheckBounds(0, i, lnrows, globals);
assert(!IsInMIS(map[i+firstrow]));
rtol = nrm2s[i]*tol;
/* get the row according to the _previous_ permutation */
k = iperm[i]-ndone;
CheckBounds(0, k, ntogo, globals);
nnz = rmat->rmat_rnz[k];
rcolind = rmat->rmat_rcolind[k];
rvalues = rmat->rmat_rvalues[k];
rrowlen = rmat->rmat_rrowlen[k];
/* Initialize workspace and determine the L indices (ie., MIS).
* The L indices are stored as either the row's new local permutation
* or the permuted order we recieved the row. The LSB is a flag
* for being local (==0) or remote (==1). */
jr[rcolind[0]] = 0; /* store diagonal first */
jw[0] = rcolind[0];
w[0] = rvalues[0];
lastlr = 0;
for (lastjr=1; lastjr<nnz; lastjr++) {
CheckBounds(0, rcolind[lastjr], nrows, globals);
/* record L elements */
if (IsInMIS(map[rcolind[lastjr]])) {
if (rcolind[lastjr] >= firstrow && rcolind[lastjr] < lastrow)
lr[lastlr] = (newiperm[rcolind[lastjr]-firstrow] << 1);
else {
lr[lastlr] = map[rcolind[lastjr]]; /* map[] == (l<<1) | 1 */
assert(incolind[StripMIS(map[rcolind[lastjr]])+1] ==
rcolind[lastjr]);
}
lastlr++;
}
jr[rcolind[lastjr]] = lastjr;
jw[lastjr] = rcolind[lastjr];
w[lastjr] = rvalues[lastjr];
}
assert(lastjr == nnz);
assert(lastjr > 0);
/* Go through the L nonzeros and pull in the contributions */
while( lastlr != 0 ) {
k = ExtractMinLR( globals );
if ( IsLocal(k) ) { /* Local node -- row is in DU */
CheckBounds(0, StripLocal(k), lnrows, globals);
kk = newperm[ StripLocal(k) ]; /* remove the local bit (LSB) */
k = kk+firstrow;
CheckBounds(0, kk, lnrows, globals);
CheckBounds(0, jr[k], lastjr, globals);
assert(jw[jr[k]] == k);
mult = w[jr[k]]*dvalues[kk];
w[jr[k]] = mult;
if (fabs(mult) < rtol)
continue; /* First drop test */
for (l=usrowptr[kk]; l<uerowptr[kk]; l++) {
CheckBounds(0, ucolind[l], nrows, globals);
m = jr[ucolind[l]];
if (m == -1) {
if (fabs(mult*uvalues[l]) < rtol)
continue; /* Don't worry. The fill has too small of a value */
/* record L elements -- these must be local */
if (IsInMIS(map[ucolind[l]])) {
assert(ucolind[l] >= firstrow && ucolind[l] < lastrow);
lr[lastlr] = (newiperm[ucolind[l]-firstrow] << 1);
lastlr++;
}
/* Create fill */
jr[ucolind[l]] = lastjr;
jw[lastjr] = ucolind[l];
w[lastjr] = -mult*uvalues[l];
lastjr++;
}
else
w[m] -= mult*uvalues[l];
}
}
else { /* Outside node -- row is in incolind/invalues */
start = StripLocal(k); /* Remove the local bit (LSB) */
end = start + incolind[start]; /* get length */
start++;
k = incolind[start]; /* get diagonal colind == row index */
CheckBounds(0, k, nrows, globals);
CheckBounds(0, jr[k], lastjr, globals);
assert(jw[jr[k]] == k);
mult = w[jr[k]]*invalues[start];
w[jr[k]] = mult;
if (fabs(mult) < rtol)
continue; /* First drop test */
for (l=++start; l<=end; l++) {
CheckBounds(0, incolind[l], nrows, globals);
m = jr[incolind[l]];
if (m == -1) {
if (fabs(mult*invalues[l]) < rtol)
continue; /* Don't worry. The fill has too small of a value */
/* record L elements -- these must be remote */
if (IsInMIS(map[incolind[l]])) {
assert(incolind[l] < firstrow || incolind[l] >= lastrow);
lr[lastlr] = map[incolind[l]]; /* map[] == (l<<1) | 1 */
lastlr++;
}
/* Create fill */
jr[incolind[l]] = lastjr;
jw[lastjr] = incolind[l];
w[lastjr] = -mult*invalues[l];
lastjr++;
}
else
w[m] -= mult*invalues[l];
}
}
} /* L non-zeros */
/* perform SecondDrops and store in appropriate places */
SecondDropSmall( rtol, globals );
m = SeperateLU_byMIS( globals);
UpdateL( i, m, ldu, globals );
FormNRmat( inr++, m, nrmat, global_maxnz, rrowlen, rcolind, rvalues, globals );
/* FormNRmat( inr++, m, nrmat, 3*global_maxnz, rcolind, rvalues, globals ); */
}
}
/*************************************************************************
* This function performs a serial ILUT on the local MIS rows, then calls
* SecondDrop to drop some elements and create LDU. If the set is truly
* independant, then this just puts the row into DU. If there are
* dependencies within a PE this factors those, adding to L, and forms DU.
**************************************************************************/
void FactorLocal(FactorMatType *ldu, ReduceMatType *rmat,
ReduceMatType *nrmat, CommInfoType *cinfo,
int *perm, int *iperm,
int *newperm, int *newiperm, int nmis, double tol,
hypre_PilutSolverGlobals *globals)
{
int i, ir, j, k, kk, l, m, nnz, diag;
int *usrowptr, *uerowptr, *ucolind, *rcolind;
double *uvalues, *nrm2s, *rvalues, *dvalues;
double mult, nval, rtol;
#ifdef HYPRE_DEBUG
PrintLine("FactorLocal", globals);
#endif
assert( rmat != nrmat );
assert( perm != newperm );
assert( iperm != newiperm );
usrowptr = ldu->usrowptr;
uerowptr = ldu->uerowptr;
ucolind = ldu->ucolind;
uvalues = ldu->uvalues;
dvalues = ldu->dvalues;
nrm2s = ldu->nrm2s;
/* OK, now factor the nmis rows */
for (ir=ndone; ir<ndone+nmis; ir++) {
i = newperm[ir];
CheckBounds(0, i, lnrows, globals);
assert(IsInMIS(map[i+firstrow]));
rtol = nrm2s[i]*tol; /* Compute relative tolerance */
diag = newiperm[i];
/* get the row according to the _previous_ permutation */
k = iperm[i]-ndone;
CheckBounds(0, k, ntogo, globals);
nnz = rmat->rmat_rnz[k];
rcolind = rmat->rmat_rcolind[k];
rvalues = rmat->rmat_rvalues[k];
/* Initialize workspace and determines the L indices.
* Since there are only local nodes, we just store the
* row's new permutation into lr, without any flags. */
jr[rcolind[0]] = 0; /* store diagonal first */
jw[0] = rcolind[0];
w[0] = rvalues[0];
assert(jw[0] == i+firstrow);
lastlr = 0;
for (lastjr=1; lastjr<nnz; lastjr++) {
CheckBounds(0, rcolind[lastjr], nrows, globals);
/* record L elements */
if (rcolind[lastjr] >= firstrow &&
rcolind[lastjr] < lastrow &&
newiperm[rcolind[lastjr]-firstrow] < diag) {
lr[lastlr] = newiperm[rcolind[lastjr]-firstrow];
lastlr++;
}
jr[rcolind[lastjr]] = lastjr;
jw[lastjr] = rcolind[lastjr];
w[lastjr] = rvalues[lastjr];
}
/* Go through the L nonzeros and pull in the contributions */
while( lastlr != 0 ) {
k = ExtractMinLR(globals);
CheckBounds(0, k, lnrows, globals);
kk = newperm[ k ];
k = kk+firstrow;
CheckBounds(0, kk, lnrows, globals);
CheckBounds(0, jr[k], lastjr, globals);
assert(jw[jr[k]] == k);
mult = w[jr[k]]*dvalues[kk];
w[jr[k]] = mult;
if (fabs(mult) < rtol)
continue; /* First drop test */
for (l=usrowptr[kk]; l<uerowptr[kk]; l++) {
CheckBounds(0, ucolind[l], nrows, globals);
m = jr[ucolind[l]];
if (m == -1) {
if (fabs(mult*uvalues[l]) < rtol)
continue; /* Don't worry. The fill has too small of a value */
/* record L elements */
if (ucolind[l] >= firstrow &&
ucolind[l] < lastrow &&
newiperm[ucolind[l]-firstrow] < diag) {
assert(IsInMIS(map[ucolind[l]]));
lr[lastlr] = newiperm[ucolind[l]-firstrow];
lastlr++;
}
/* Create fill */
jr[ucolind[l]] = lastjr;
jw[lastjr] = ucolind[l];
w[lastjr] = -mult*uvalues[l];
lastjr++;
}
else
w[m] -= mult*uvalues[l];
}
} /* L non-zeros */
/* perform SecondDrops and store in appropriate places */
SecondDropSmall( rtol, globals );
m = SeperateLU_byDIAG( diag, newiperm, globals );
UpdateL( i, m, ldu, globals );
FormDU( i, m, ldu, rcolind, rvalues, tol, globals );
}
}
/*************************************************************************
* This function drops small values from the workspace, and also resets
* the jr[] array to all -1's.
**************************************************************************/
void SecondDropSmall( double rtol,
hypre_PilutSolverGlobals *globals )
{
int i;
/* Reset the jr array. */
for (i=0; i<lastjr; i++) {
CheckBounds(0, jw[i], nrows, globals);
jr[jw[i]] = -1;
}
/* Remove any (off-diagonal) elements of the row below the tolerance */
for (i=1; i<lastjr;) {
if (fabs(w[i]) < rtol) {
jw[i] = jw[--lastjr];
w[i] = w[lastjr];
}
else
i++;
}
}
/*****************************************************************
* This function seperates the L and U portions of the workspace
* and returns the point at which they seperate, so
* L entries are between [1 .. point)
* U or rmat entries are [point .. lastjr)
* We assume the diagonal D is index [0].
*
* This version compares the (new) permuted order of entries to the
* given permuted order of the row (diag) to determine entries in L.
* This is suitable for local factorizations.
******************************************************************/
int SeperateLU_byDIAG( int diag, int *newiperm,
hypre_PilutSolverGlobals *globals )
{
int first, last, itmp;
double dtmp;
/* Perform a Qsort type pass to seperate L and U (rmat) entries. */
if (lastjr == 1)
last = first = 1;
else {
last = 1;
first = lastjr-1;
while (true) {
while (last < first && /* while (last < first AND [last] is in L) */
(jw[last] >= firstrow &&
jw[last] < lastrow &&
newiperm[jw[last]-firstrow] < diag))
last++;
while (last < first && /* while (last < first AND [first] is not in L) */
! (jw[first] >= firstrow &&
jw[first] < lastrow &&
newiperm[jw[first]-firstrow] < diag))
first--;
if (last < first) {
SWAP(jw[first], jw[last], itmp);
SWAP( w[first], w[last], dtmp);
last++; first--;
}
if (last == first) {
if ((jw[last] >= firstrow && /* if [last] is in L */
jw[last] < lastrow &&
newiperm[jw[last]-firstrow] < diag)) {
first++;
last++;
}
break;
}
else if (last > first) {
first++;
break;
}
}
}
#ifndef NDEBUG
/* DEBUGGING: verify sorting to some extent */
for (itmp=1; itmp<last; itmp++) {
assert((jw[itmp] >= firstrow && /* [itmp] is in L -- must be MIS */
jw[itmp] < lastrow &&
newiperm[jw[itmp]-firstrow] < diag));
assert(IsInMIS(map[jw[itmp]]));
}
for (itmp=first; itmp<lastjr; itmp++) {
assert(!(jw[itmp] >= firstrow && /* [itmp] is not in L -- may be MIS still */
jw[itmp] < lastrow &&
newiperm[jw[itmp]-firstrow] < diag));
}
assert(last == first);
#endif
return first;
}
/*****************************************************************
* This function seperates the L and U portions of the workspace
* and returns the point at which they seperate, so
* L entries are between [1 .. point)
* U or rmat entries are [point .. lastjr)
* We assume the diagonal D is index [0].
*
* This version simply uses the MIS to determine entries in L.
* This is suitable for reductions involving rows on other PEs,
* where -every- row in the MIS will be part of L.
******************************************************************/
int SeperateLU_byMIS( hypre_PilutSolverGlobals *globals )
{
int first, last, itmp;
double dtmp;
/* Perform a Qsort type pass to seperate L and U (rmat) entries. */
if (lastjr == 1)
last = first = 1;
else {
last = 1;
first = lastjr-1;
while (true) {
while (last < first && IsInMIS(map[jw[last ]])) /* and [last] is in L */
last++;
while (last < first && ! IsInMIS(map[jw[first]])) /* and [first] is not in L */
first--;
if (last < first) {
SWAP(jw[first], jw[last], itmp);
SWAP( w[first], w[last], dtmp);
last++; first--;
}
if (last == first) {
if (IsInMIS(map[jw[last]])) {
first++;
last++;
}
break;
}
else if (last > first) {
first++;
break;
}
}
}
#ifndef NDEBUG
/* DEBUGGING: verify sorting to some extent */
for (itmp=1; itmp<last; itmp++)
assert(IsInMIS(map[jw[itmp]]));
for (itmp=first; itmp<lastjr; itmp++)
assert(!IsInMIS(map[jw[itmp]]));
assert(last == first);
#endif
return first;
}
/*************************************************************************
* This function updates the L part of the given row, assuming that the
* workspace has already been split into L and U entries. L may already
* be partially or completely full--this fills it and then starts to
* replace the min value.
**************************************************************************/
void UpdateL(int lrow, int last, FactorMatType *ldu,
hypre_PilutSolverGlobals *globals)
{
int i, j, min, start, end;
int *lcolind;
double *lvalues;
lcolind = ldu->lcolind;
lvalues = ldu->lvalues;
start = ldu->lsrowptr[lrow];
end = ldu->lerowptr[lrow];
/* The entries between [1, last) are part of L */
for (i=1; i<last; i++) {
if (end-start < global_maxnz) { /* In case we did not have maxnz in L */
lcolind[end] = jw[i];
lvalues[end] = w[i];
end++;
}
else {
min = start; /* find min and replace if i is larger */
for (j=start+1; j<end; j++) {
if (fabs(lvalues[j]) < fabs(lvalues[min]))
min = j;
}
if (fabs(lvalues[min]) < fabs(w[i])) {
lcolind[min] = jw[i];
lvalues[min] = w[i];
}
}
}
ldu->lerowptr[lrow] = end;
CheckBounds(0, end-start, global_maxnz+1, globals);
}
/*************************************************************************
* This function forms the new reduced row corresponding to
* the given row, assuming that the
* workspace has already been split into L and U (rmat) entries. It reuses
* the memory for the row in the reduced matrix, storing the new row into
* nrmat->*[rrow].
* New version allows new row to be larger than original row, so it does not
* necessarily reuse the same memory. AC 3-18
**************************************************************************/
void FormNRmat(int rrow, int first, ReduceMatType *nrmat,
int max_rowlen,
int in_rowlen, int *in_colind, double *in_values,
hypre_PilutSolverGlobals *globals )
{
int nz, max, j, out_rowlen, *rcolind;
double *rvalues;
assert(in_colind[0] == jw[0]); /* diagonal at the beginning */
/* check to see if we need to reallocate space */
out_rowlen = hypre_min( max_rowlen, lastjr-first+1 );
if( out_rowlen > in_rowlen )
{
free_multi( in_colind, in_values, -1 );
rcolind = idx_malloc( out_rowlen, "FornNRmat: rcolind");
rvalues = fp_malloc( out_rowlen, "FornNRmat: rvalues");
}else
{
rcolind = in_colind;
rvalues = in_values;
}
rcolind[0] = jw[0];
rvalues[0] = w[0];
/* The entries [first, lastjr) are part of U (rmat) */
if (lastjr-first+1 <= max_rowlen) { /* Simple copy */
for (nz=1, j=first; j<lastjr; nz++, j++) {
rcolind[nz] = jw[j];
rvalues[nz] = w[j];
}
assert(nz == lastjr-first+1);
}
else { /* Keep largest out_rowlen elements in the reduced row */
for (nz=1; nz<out_rowlen; nz++) {
max = first;
for (j=first+1; j<lastjr; j++) {
if (fabs(w[j]) > fabs(w[max]))
max = j;
}
rcolind[nz] = jw[max]; /* store max */
rvalues[nz] = w[max];
jw[max] = jw[--lastjr]; /* swap max out */
w[max] = w[ lastjr];
}
assert(nz == out_rowlen);
}
assert(nz <= max_rowlen);
/* link the reused storage to the new reduced system */
nrmat->rmat_rnz[rrow] = nz;
nrmat->rmat_rrowlen[rrow] = out_rowlen;
nrmat->rmat_rcolind[rrow] = rcolind;
nrmat->rmat_rvalues[rrow] = rvalues;
}
/*************************************************************************
* This function forms the DU part of the given row, assuming that the
* workspace has already been split into L and U entries. It disposes of
* the memory used by the row in the reduced matrix.
**************************************************************************/
void FormDU(int lrow, int first, FactorMatType *ldu,
int *rcolind, double *rvalues, double tol,
hypre_PilutSolverGlobals *globals )
{
int nz, max, j, end;
int *ucolind, *uerowptr;
double *uvalues;
ucolind = ldu->ucolind;
uerowptr = ldu->uerowptr;
uvalues = ldu->uvalues;
/*
* Take care of the diagonal
*/
if (w[0] == 0.0) {
printf("Zero pivot in row %d, adding e to proceed!\n", lrow);
ldu->dvalues[lrow] = 1.0/tol;
}
else
ldu->dvalues[lrow] = 1.0/w[0];
/*
* Take care of the elements of U
* Note U is completely empty beforehand.
*/
end = ldu->uerowptr[lrow];
assert(ldu->usrowptr[lrow] == ldu->uerowptr[lrow]);
for (nz=0; nz<global_maxnz && lastjr>first; nz++) {
/* The entries [first, lastjr) are part of U */
max = first;
for (j=first+1; j<lastjr; j++) {
if (fabs(w[j]) > fabs(w[max]))
max = j;
}
ucolind[end] = jw[max]; /* store max */
uvalues[end] = w[max];
end++;
jw[max] = jw[--lastjr]; /* swap max out */
w[max] = w[ lastjr];
}
uerowptr[lrow] = end;
/* free the row storage */
free( rcolind );
free( rvalues );
}
/*************************************************************************
* This function zeros the map for all local rows and rows we recieved.
* During debugging it checks the entire map to ensure other entries remain
* zero as expected. cinfo->rnbrptr[i] has the _actual_ number of rows
* recieved from PE rnbrind[i], which is set in SendFactoredRows.
**************************************************************************/
void EraseMap(CommInfoType *cinfo, int *newperm, int nmis,
hypre_PilutSolverGlobals *globals)
{
int i, j, k, cnt, rnnbr;
int *rnbrptr, *incolind;
rnnbr = cinfo->rnnbr;
rnbrptr = cinfo->rnbrptr;
incolind = cinfo->incolind;
#ifdef HYPRE_DEBUG
PrintLine("EraseMap", globals);
#endif
/* clear map of all MIS rows */
for (i=ndone; i<ndone+nmis; i++)
map[newperm[i]+firstrow] = 0;
/* clear map of all recieved rows. see SendFactoredRows code */
j = 1; /* row index in [1] */
cnt = (cinfo->maxntogo)*(global_maxnz+2) ;
for (i=0; i<rnnbr; i++) {
for (k=0; k<rnbrptr[i]; k += global_maxnz+2)
map[incolind[j+k]] = 0;
j += cnt;
}
#ifndef NDEBUG
/* DEBUGGING: check entire map */
for (i=0; i<nrows; i++)
if ( map[i] != 0 ) {
printf("PE %d BAD ERASE %d [%d %d]\n", mype, i, firstrow, lastrow);
map[i] = 0;
}
#endif
}
/*************************************************************************
* This function allocates datastructures for the new reduced matrix (nrmat),
* the global workspace, and the communication info. Some parts of the
* comm info are allocated dynamically so we just initialize their size to
* zero here, forcing an allocation the first time ComputeCommInfo is called.
* Comments indicate where in George's code these originally existed.
**************************************************************************/
void ParINIT( ReduceMatType *nrmat, CommInfoType *cinfo, int *rowdist,
hypre_PilutSolverGlobals *globals )
{
int i;
#ifdef HYPRE_DEBUG
PrintLine("ParINIT", globals);
#endif
/* save a global copy of the row distribution */
vrowdist = idx_malloc(npes+1, "ParINIT: vrowdist");
memcpy_idx(vrowdist, rowdist, npes+1);
/* ---- ParILUT ---- */
/* Allocate the new rmat */
nrmat->rmat_rnz = idx_malloc(ntogo, "ParILUT: nrmat->rmat_rnz" );
nrmat->rmat_rrowlen = idx_malloc(ntogo, "ParILUT: nrmat->rmat_rrowlen");
nrmat->rmat_rcolind = (int **) mymalloc( sizeof(int*)*ntogo, "ParILUT: nrmat->rmat_rcolind");
nrmat->rmat_rvalues = (double **) mymalloc( sizeof(double*) *ntogo, "ParILUT: nrmat->rmat_rvalues");
/* Allocate work space */
jr = idx_malloc_init(nrows, -1, "ParILUT: jr");
lr = idx_malloc_init(nleft, -1, "ParILUT: lr");
jw = idx_malloc(nleft, "ParILUT: jw");
w = fp_malloc(nleft, "ParILUT: w");
/* ---- ComputeCommInfo ---- */
/* Allocate global map */
map = idx_malloc_init(nrows, 0, "ComputeCommInfo: map");
/* Allocate cinfo */
cinfo->rnbrind = idx_malloc(npes, "ComputeCommInfo: cinfo->rnbrind");
cinfo->rrowind = idx_malloc(nleft, "ComputeCommInfo: cinfo->rrowind");
cinfo->rnbrptr = idx_malloc(npes+1, "ComputeCommInfo: cinfo->rnbrptr");
cinfo->snbrind = idx_malloc(npes, "ComputeCommInfo: cinfo->snbrind");
cinfo->snbrptr = idx_malloc(npes+1, "ComputeCommInfo: cinfo->snbrptr");
/* force allocates within ComputeCommInfo */
cinfo->incolind = NULL;
cinfo->invalues = NULL;
cinfo->srowind = NULL;
cinfo->maxnrecv = 0;
cinfo->maxnsend = 0;
/* ---- ComputeMIS ---- */
cinfo->gatherbuf = fp_malloc(ntogo*(global_maxnz+2), "ComputeMIS: gatherbuf");
}