hypre/distributed_ls/ParaSails/Matrix.c

878 lines
24 KiB
C

/*BHEADER**********************************************************************
* Copyright (c) 2007, Lawrence Livermore National Security, LLC.
* Produced at the Lawrence Livermore National Laboratory.
* Written by the HYPRE team. UCRL-CODE-222953.
* All rights reserved.
*
* This file is part of HYPRE (see http://www.llnl.gov/CASC/hypre/).
* Please see the COPYRIGHT_and_LICENSE file for the copyright notice,
* disclaimer, contact information and the GNU Lesser General Public License.
*
* HYPRE is free software; you can redistribute it and/or modify it under the
* terms of the GNU General Public License (as published by the Free Software
* Foundation) version 2.1 dated February 1999.
*
* HYPRE is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the terms and conditions of the GNU General
* Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* $Revision$
***********************************************************************EHEADER*/
/******************************************************************************
*
* Matrix - Matrix stored and accessible by rows. Indices and values for
* the matrix nonzeros are copied into the matrix a row at a time, in any
* order using the MatrixGetRow function. The MatrixPutRow function returns
* a pointer to the indices and values of a row. The matrix has a set of
* row and column indices such that these indices begin at "beg" and end
* at "end", where 0 <= "beg" <= "end". In other words, the matrix indices
* have any nonnegative base value, and the base values of the row and column
* indices must agree.
*
*****************************************************************************/
#include <stdlib.h>
#include <memory.h>
#include <assert.h>
#include "Common.h"
#include "Matrix.h"
#include "Numbering.h"
#define MAX_NZ_PER_ROW 1000
/*--------------------------------------------------------------------------
* MatrixCreate - Return (a pointer to) a matrix object.
*--------------------------------------------------------------------------*/
Matrix *MatrixCreate(MPI_Comm comm, int beg_row, int end_row)
{
int num_rows, mype, npes;
Matrix *mat = (Matrix *) malloc(sizeof(Matrix));
mat->comm = comm;
mat->beg_row = beg_row;
mat->end_row = end_row;
mat->mem = (Mem *) MemCreate();
num_rows = mat->end_row - mat->beg_row + 1;
mat->lens = (int *) MemAlloc(mat->mem, num_rows * sizeof(int));
mat->inds = (int **) MemAlloc(mat->mem, num_rows * sizeof(int *));
mat->vals = (double **) MemAlloc(mat->mem, num_rows * sizeof(double *));
/* Send beg_row and end_row to all processors */
/* This is needed in order to map row numbers to processors */
MPI_Comm_rank(comm, &mype);
MPI_Comm_size(comm, &npes);
mat->beg_rows = (int *) MemAlloc(mat->mem, npes * sizeof(int));
mat->end_rows = (int *) MemAlloc(mat->mem, npes * sizeof(int));
MPI_Allgather(&beg_row, 1, MPI_INT, mat->beg_rows, 1, MPI_INT, comm);
MPI_Allgather(&end_row, 1, MPI_INT, mat->end_rows, 1, MPI_INT, comm);
mat->num_recv = 0;
mat->num_send = 0;
mat->recv_req = NULL;
mat->send_req = NULL;
mat->recv_req2 = NULL;
mat->send_req2 = NULL;
mat->statuses = NULL;
mat->sendind = NULL;
mat->sendbuf = NULL;
mat->recvbuf = NULL;
mat->numb = NULL;
return mat;
}
/*--------------------------------------------------------------------------
* MatrixCreateLocal - Return (a pointer to) a matrix object.
* The matrix created by this call is a local matrix, not a global matrix.
*--------------------------------------------------------------------------*/
Matrix *MatrixCreateLocal(int beg_row, int end_row)
{
int num_rows;
Matrix *mat = (Matrix *) malloc(sizeof(Matrix));
mat->comm = MPI_COMM_NULL;
mat->beg_row = beg_row;
mat->end_row = end_row;
mat->mem = (Mem *) MemCreate();
num_rows = mat->end_row - mat->beg_row + 1;
mat->lens = (int *) MemAlloc(mat->mem, num_rows * sizeof(int));
mat->inds = (int **) MemAlloc(mat->mem, num_rows * sizeof(int *));
mat->vals = (double **) MemAlloc(mat->mem, num_rows * sizeof(double *));
/* Send beg_row and end_row to all processors */
/* This is needed in order to map row numbers to processors */
mat->beg_rows = NULL;
mat->end_rows = NULL;
mat->num_recv = 0;
mat->num_send = 0;
mat->recv_req = NULL;
mat->send_req = NULL;
mat->recv_req2 = NULL;
mat->send_req2 = NULL;
mat->statuses = NULL;
mat->sendind = NULL;
mat->sendbuf = NULL;
mat->recvbuf = NULL;
mat->numb = NULL;
return mat;
}
/*--------------------------------------------------------------------------
* MatrixDestroy - Destroy a matrix object "mat".
*--------------------------------------------------------------------------*/
void MatrixDestroy(Matrix *mat)
{
int i;
for (i=0; i<mat->num_recv; i++)
MPI_Request_free(&mat->recv_req[i]);
for (i=0; i<mat->num_send; i++)
MPI_Request_free(&mat->send_req[i]);
free(mat->recv_req);
free(mat->send_req);
free(mat->recv_req2);
free(mat->send_req2);
free(mat->statuses);
free(mat->sendind);
free(mat->sendbuf);
free(mat->recvbuf);
MemDestroy(mat->mem);
if (mat->numb)
NumberingDestroy(mat->numb);
free(mat);
}
/*--------------------------------------------------------------------------
* MatrixSetRow - Set a row in a matrix. Only local rows can be set.
* Once a row has been set, it should not be set again, or else the
* memory used by the existing row will not be recovered until
* the matrix is destroyed. "row" is in global coordinate numbering.
*--------------------------------------------------------------------------*/
void MatrixSetRow(Matrix *mat, int row, int len, int *ind, double *val)
{
row -= mat->beg_row;
mat->lens[row] = len;
mat->inds[row] = (int *) MemAlloc(mat->mem, len*sizeof(int));
mat->vals[row] = (double *) MemAlloc(mat->mem, len*sizeof(double));
if (ind != NULL)
memcpy(mat->inds[row], ind, len*sizeof(int));
if (val != NULL)
memcpy(mat->vals[row], val, len*sizeof(double));
}
/*--------------------------------------------------------------------------
* MatrixGetRow - Get a *local* row in a matrix.
*--------------------------------------------------------------------------*/
void MatrixGetRow(Matrix *mat, int row, int *lenp, int **indp, double **valp)
{
*lenp = mat->lens[row];
*indp = mat->inds[row];
*valp = mat->vals[row];
}
/*--------------------------------------------------------------------------
* MatrixRowPe - Map "row" to a processor number.
*--------------------------------------------------------------------------*/
int MatrixRowPe(Matrix *mat, int row)
{
int npes, pe;
int *beg = mat->beg_rows;
int *end = mat->end_rows;
MPI_Comm_size(mat->comm, &npes);
for (pe=0; pe<npes; pe++)
{
if (row >= beg[pe] && row <= end[pe])
return pe;
}
printf("MatrixRowPe: could not map row %d.\n", row);
PARASAILS_EXIT;
return -1; /* for picky compilers */
}
/*--------------------------------------------------------------------------
* MatrixNnz - Return total number of nonzeros in preconditioner.
*--------------------------------------------------------------------------*/
int MatrixNnz(Matrix *mat)
{
int num_local, i, total, alltotal;
num_local = mat->end_row - mat->beg_row + 1;
total = 0;
for (i=0; i<num_local; i++)
total += mat->lens[i];
MPI_Allreduce(&total, &alltotal, 1, MPI_INT, MPI_SUM, mat->comm);
return alltotal;
}
/*--------------------------------------------------------------------------
* MatrixPrint - Print a matrix to a file "filename". Each processor
* appends to the file in order, but the file is overwritten if it exists.
*--------------------------------------------------------------------------*/
void MatrixPrint(Matrix *mat, char *filename)
{
int mype, npes, pe;
int row, i, len, *ind;
double *val;
MPI_Comm_rank(mat->comm, &mype);
MPI_Comm_size(mat->comm, &npes);
for (pe=0; pe<npes; pe++)
{
MPI_Barrier(mat->comm);
if (mype == pe)
{
FILE *file = fopen(filename, (pe==0 ? "w" : "a"));
assert(file != NULL);
for (row=0; row<=mat->end_row - mat->beg_row; row++)
{
MatrixGetRow(mat, row, &len, &ind, &val);
for (i=0; i<len; i++)
fprintf(file, "%d %d %.14e\n",
row + mat->beg_row,
mat->numb->local_to_global[ind[i]], val[i]);
}
fclose(file);
}
}
}
/*--------------------------------------------------------------------------
* MatrixReadMaster - MatrixRead routine for processor 0. Internal use.
*--------------------------------------------------------------------------*/
static void MatrixReadMaster(Matrix *mat, char *filename)
{
MPI_Comm comm = mat->comm;
int mype, npes;
FILE *file;
int ret;
int num_rows, curr_proc;
int row, col;
double value;
long offset;
long outbuf;
int curr_row;
int len;
int ind[MAX_NZ_PER_ROW];
double val[MAX_NZ_PER_ROW];
char line[100];
int oldrow;
MPI_Request request;
MPI_Status status;
MPI_Comm_size(mat->comm, &npes);
MPI_Comm_rank(mat->comm, &mype);
file = fopen(filename, "r");
assert(file != NULL);
fgets(line, 100, file);
#ifdef EMSOLVE
ret = sscanf(line, "%*d %d %*d %*d", &num_rows);
for (row=0; row<num_rows; row++)
fscanf(file, "%*d");
#else
ret = sscanf(line, "%d %*d %*d", &num_rows);
#endif
offset = ftell(file);
fscanf(file, "%d %d %lf", &row, &col, &value);
request = MPI_REQUEST_NULL;
curr_proc = 1; /* proc for which we are looking for the beginning */
while (curr_proc < npes)
{
if (row == mat->beg_rows[curr_proc])
{
MPI_Wait(&request, &status);
outbuf = offset;
MPI_Isend(&outbuf, 1, MPI_LONG, curr_proc, 0, comm, &request);
curr_proc++;
}
offset = ftell(file);
oldrow = row;
fscanf(file, "%d %d %lf", &row, &col, &value);
if (oldrow > row)
{
fprintf(stderr, "Matrix file is not sorted by rows.\n");
PARASAILS_EXIT;
}
}
/* Now read our own part */
rewind(file);
fgets(line, 100, file);
#ifdef EMSOLVE
ret = sscanf(line, "%*d %d %*d %*d", &num_rows);
for (row=0; row<num_rows; row++)
fscanf(file, "%*d");
#else
ret = sscanf(line, "%d %*d %*d", &num_rows);
#endif
ret = fscanf(file, "%d %d %lf", &row, &col, &value);
curr_row = row;
len = 0;
while (ret != EOF && row <= mat->end_row)
{
if (row != curr_row)
{
/* store this row */
MatrixSetRow(mat, curr_row, len, ind, val);
curr_row = row;
/* reset row pointer */
len = 0;
}
if (len >= MAX_NZ_PER_ROW)
{
fprintf(stderr, "The matrix has exceeded %d\n", MAX_NZ_PER_ROW);
fprintf(stderr, "nonzeros per row. Internal buffers must be\n");
fprintf(stderr, "increased to continue.\n");
PARASAILS_EXIT;
}
ind[len] = col;
val[len] = value;
len++;
ret = fscanf(file, "%d %d %lf", &row, &col, &value);
}
/* Store the final row */
if (ret == EOF || row > mat->end_row)
MatrixSetRow(mat, mat->end_row, len, ind, val);
fclose(file);
MPI_Wait(&request, &status);
}
/*--------------------------------------------------------------------------
* MatrixReadSlave - MatrixRead routine for other processors. Internal use.
*--------------------------------------------------------------------------*/
static void MatrixReadSlave(Matrix *mat, char *filename)
{
MPI_Comm comm = mat->comm;
MPI_Status status;
int mype;
FILE *file;
int ret;
int row, col;
double value;
long offset;
int curr_row;
int len;
int ind[MAX_NZ_PER_ROW];
double val[MAX_NZ_PER_ROW];
double time0, time1;
file = fopen(filename, "r");
assert(file != NULL);
MPI_Comm_rank(mat->comm, &mype);
MPI_Recv(&offset, 1, MPI_LONG, 0, 0, comm, &status);
time0 = MPI_Wtime();
ret = fseek(file, offset, SEEK_SET);
assert(ret == 0);
ret = fscanf(file, "%d %d %lf", &row, &col, &value);
curr_row = row;
len = 0;
while (ret != EOF && row <= mat->end_row)
{
if (row != curr_row)
{
/* store this row */
MatrixSetRow(mat, curr_row, len, ind, val);
curr_row = row;
/* reset row pointer */
len = 0;
}
if (len >= MAX_NZ_PER_ROW)
{
fprintf(stderr, "The matrix has exceeded %d\n", MAX_NZ_PER_ROW);
fprintf(stderr, "nonzeros per row. Internal buffers must be\n");
fprintf(stderr, "increased to continue.\n");
PARASAILS_EXIT;
}
ind[len] = col;
val[len] = value;
len++;
ret = fscanf(file, "%d %d %lf", &row, &col, &value);
}
/* Store the final row */
if (ret == EOF || row > mat->end_row)
MatrixSetRow(mat, mat->end_row, len, ind, val);
fclose(file);
time1 = MPI_Wtime();
printf("%d: Time for slave read: %f\n", mype, time1-time0);
}
/*--------------------------------------------------------------------------
* MatrixRead - Read a matrix file "filename" from disk and store in the
* matrix "mat" which has already been created using MatrixCreate. The format
* assumes no nonzero rows, the rows are in order, and there will be at least
* one row per processor.
*--------------------------------------------------------------------------*/
void MatrixRead(Matrix *mat, char *filename)
{
int mype;
double time0, time1;
MPI_Comm_rank(mat->comm, &mype);
time0 = MPI_Wtime();
if (mype == 0)
MatrixReadMaster(mat, filename);
else
MatrixReadSlave(mat, filename);
time1 = MPI_Wtime();
printf("%d: Time for reading matrix: %f\n", mype, time1-time0);
MatrixComplete(mat);
}
/*--------------------------------------------------------------------------
* RhsRead - Read a right-hand side file "filename" from disk and store in the
* location pointed to by "rhs". "mat" is needed to provide the partitioning
* information. The expected format is: a header line (n, nrhs) followed
* by n values. Also allows isis format, indicated by 1 int in first line.
*--------------------------------------------------------------------------*/
void RhsRead(double *rhs, Matrix *mat, char *filename)
{
FILE *file;
MPI_Status status;
int mype, npes;
int num_rows, num_local, pe, i, converted;
double *buffer = NULL;
int buflen = 0;
char line[100];
int dummy;
MPI_Comm_size(mat->comm, &npes);
MPI_Comm_rank(mat->comm, &mype);
num_local = mat->end_row - mat->beg_row + 1;
if (mype != 0)
{
MPI_Recv(rhs, num_local, MPI_DOUBLE, 0, 0, mat->comm, &status);
return;
}
file = fopen(filename, "r");
assert(file != NULL);
fgets(line, 100, file);
converted = sscanf(line, "%d %d", &num_rows, &dummy);
assert(num_rows == mat->end_rows[npes-1]);
/* Read own rows first */
for (i=0; i<num_local; i++)
if (converted == 1) /* isis format */
fscanf(file, "%*d %lf", &rhs[i]);
else
fscanf(file, "%lf", &rhs[i]);
for (pe=1; pe<npes; pe++)
{
num_local = mat->end_rows[pe] - mat->beg_rows[pe]+ 1;
if (buflen < num_local)
{
free(buffer);
buflen = num_local;
buffer = (double *) malloc(buflen * sizeof(double));
}
for (i=0; i<num_local; i++)
if (converted == 1) /* isis format */
fscanf(file, "%*d %lf", &buffer[i]);
else
fscanf(file, "%lf", &buffer[i]);
MPI_Send(buffer, num_local, MPI_DOUBLE, pe, 0, mat->comm);
}
free(buffer);
}
/*--------------------------------------------------------------------------
* SetupReceives
*--------------------------------------------------------------------------*/
static void SetupReceives(Matrix *mat, int reqlen, int *reqind, int *outlist)
{
int i, j, this_pe, mype;
MPI_Request request;
MPI_Comm comm = mat->comm;
int num_local = mat->end_row - mat->beg_row + 1;
MPI_Comm_rank(comm, &mype);
mat->num_recv = 0;
/* Allocate recvbuf */
/* recvbuf has numlocal entires saved for local part of x, used in matvec */
mat->recvlen = reqlen; /* used for the transpose multiply */
mat->recvbuf = (double *) malloc((reqlen+num_local) * sizeof(double));
for (i=0; i<reqlen; i=j) /* j is set below */
{
/* The processor that owns the row with index reqind[i] */
this_pe = MatrixRowPe(mat, reqind[i]);
/* Figure out other rows we need from this_pe */
for (j=i+1; j<reqlen; j++)
{
/* if row is on different pe */
if (reqind[j] < mat->beg_rows[this_pe] ||
reqind[j] > mat->end_rows[this_pe])
break;
}
/* Request rows in reqind[i..j-1] */
MPI_Isend(&reqind[i], j-i, MPI_INT, this_pe, 444, comm, &request);
MPI_Request_free(&request);
/* Count of number of number of indices needed from this_pe */
outlist[this_pe] = j-i;
MPI_Recv_init(&mat->recvbuf[i+num_local], j-i, MPI_DOUBLE, this_pe, 555,
comm, &mat->recv_req[mat->num_recv]);
MPI_Send_init(&mat->recvbuf[i+num_local], j-i, MPI_DOUBLE, this_pe, 666,
comm, &mat->send_req2[mat->num_recv]);
mat->num_recv++;
}
}
/*--------------------------------------------------------------------------
* SetupSends
* This function will wait for all receives to complete.
*--------------------------------------------------------------------------*/
static void SetupSends(Matrix *mat, int *inlist)
{
int i, j, mype, npes;
MPI_Request *requests;
MPI_Status *statuses;
MPI_Comm comm = mat->comm;
MPI_Comm_rank(comm, &mype);
MPI_Comm_size(comm, &npes);
requests = (MPI_Request *) malloc(npes * sizeof(MPI_Request));
statuses = (MPI_Status *) malloc(npes * sizeof(MPI_Status));
/* Determine size of and allocate sendbuf and sendind */
mat->sendlen = 0;
for (i=0; i<npes; i++)
mat->sendlen += inlist[i];
mat->sendbuf = NULL;
mat->sendind = NULL;
if (mat->sendlen)
{
mat->sendbuf = (double *) malloc(mat->sendlen * sizeof(double));
mat->sendind = (int *) malloc(mat->sendlen * sizeof(int));
}
j = 0;
mat->num_send = 0;
for (i=0; i<npes; i++)
{
if (inlist[i] != 0)
{
/* Post receive for the actual indices */
MPI_Irecv(&mat->sendind[j], inlist[i], MPI_INT, i, 444, comm,
&requests[mat->num_send]);
/* Set up the send */
MPI_Send_init(&mat->sendbuf[j], inlist[i], MPI_DOUBLE, i, 555, comm,
&mat->send_req[mat->num_send]);
/* Set up the receive for the transpose */
MPI_Recv_init(&mat->sendbuf[j], inlist[i], MPI_DOUBLE, i, 666, comm,
&mat->recv_req2[mat->num_send]);
mat->num_send++;
j += inlist[i];
}
}
MPI_Waitall(mat->num_send, requests, statuses);
free(requests);
free(statuses);
/* convert global indices to local indices */
/* these are all indices on this processor */
for (i=0; i<mat->sendlen; i++)
mat->sendind[i] -= mat->beg_row;
}
/*--------------------------------------------------------------------------
* MatrixComplete
*--------------------------------------------------------------------------*/
void MatrixComplete(Matrix *mat)
{
int mype, npes;
int *outlist, *inlist;
int row, len, *ind;
double *val;
MPI_Comm_rank(mat->comm, &mype);
MPI_Comm_size(mat->comm, &npes);
mat->recv_req = (MPI_Request *) malloc(npes * sizeof(MPI_Request));
mat->send_req = (MPI_Request *) malloc(npes * sizeof(MPI_Request));
mat->recv_req2 = (MPI_Request *) malloc(npes * sizeof(MPI_Request));
mat->send_req2 = (MPI_Request *) malloc(npes * sizeof(MPI_Request));
mat->statuses = (MPI_Status *) malloc(npes * sizeof(MPI_Status));
outlist = (int *) calloc(npes, sizeof(int));
inlist = (int *) calloc(npes, sizeof(int));
/* Create Numbering object */
mat->numb = NumberingCreate(mat, PARASAILS_NROWS);
SetupReceives(mat, mat->numb->num_ind - mat->numb->num_loc,
&mat->numb->local_to_global[mat->numb->num_loc], outlist);
MPI_Alltoall(outlist, 1, MPI_INT, inlist, 1, MPI_INT, mat->comm);
SetupSends(mat, inlist);
free(outlist);
free(inlist);
/* Convert to local indices */
for (row=0; row<=mat->end_row - mat->beg_row; row++)
{
MatrixGetRow(mat, row, &len, &ind, &val);
NumberingGlobalToLocal(mat->numb, len, ind, ind);
}
}
/*--------------------------------------------------------------------------
* MatrixMatvec
* Can be done in place.
*--------------------------------------------------------------------------*/
void MatrixMatvec(Matrix *mat, double *x, double *y)
{
int row, i, len, *ind;
double *val, temp;
int num_local = mat->end_row - mat->beg_row + 1;
/* Set up persistent communications */
/* Assumes MatrixComplete has been called */
/* Put components of x into the right outgoing buffers */
for (i=0; i<mat->sendlen; i++)
mat->sendbuf[i] = x[mat->sendind[i]];
MPI_Startall(mat->num_recv, mat->recv_req);
MPI_Startall(mat->num_send, mat->send_req);
/* Copy local part of x into top part of recvbuf */
for (i=0; i<num_local; i++)
mat->recvbuf[i] = x[i];
MPI_Waitall(mat->num_recv, mat->recv_req, mat->statuses);
/* do the multiply */
#ifdef HYPRE_USING_OPENMP
#pragma omp parallel for private(row,len,ind,val,temp,i) schedule(static)
#endif
for (row=0; row<=mat->end_row - mat->beg_row; row++)
{
MatrixGetRow(mat, row, &len, &ind, &val);
temp = 0.0;
for (i=0; i<len; i++)
{
temp = temp + val[i] * mat->recvbuf[ind[i]];
}
y[row] = temp;
}
MPI_Waitall(mat->num_send, mat->send_req, mat->statuses);
}
void MatrixMatvecSerial(Matrix *mat, double *x, double *y)
{
int row, i, len, *ind;
double *val, temp;
int num_local = mat->end_row - mat->beg_row + 1;
/* Set up persistent communications */
/* Assumes MatrixComplete has been called */
/* Put components of x into the right outgoing buffers */
for (i=0; i<mat->sendlen; i++)
mat->sendbuf[i] = x[mat->sendind[i]];
MPI_Startall(mat->num_recv, mat->recv_req);
MPI_Startall(mat->num_send, mat->send_req);
/* Copy local part of x into top part of recvbuf */
for (i=0; i<num_local; i++)
mat->recvbuf[i] = x[i];
MPI_Waitall(mat->num_recv, mat->recv_req, mat->statuses);
/* do the multiply */
for (row=0; row<=mat->end_row - mat->beg_row; row++)
{
MatrixGetRow(mat, row, &len, &ind, &val);
temp = 0.0;
for (i=0; i<len; i++)
{
temp = temp + val[i] * mat->recvbuf[ind[i]];
}
y[row] = temp;
}
MPI_Waitall(mat->num_send, mat->send_req, mat->statuses);
}
/*--------------------------------------------------------------------------
* MatrixMatvecTrans
* Can be done in place.
*--------------------------------------------------------------------------*/
void MatrixMatvecTrans(Matrix *mat, double *x, double *y)
{
int row, i, len, *ind;
double *val;
int num_local = mat->end_row - mat->beg_row + 1;
/* Set up persistent communications */
/* Assumes MatrixComplete has been called */
/* Post receives for local parts of the solution y */
MPI_Startall(mat->num_send, mat->recv_req2);
/* initialize accumulator buffer to zero */
for (i=0; i<mat->recvlen+num_local; i++)
mat->recvbuf[i] = 0.0;
/* do the multiply */
for (row=0; row<=mat->end_row - mat->beg_row; row++)
{
MatrixGetRow(mat, row, &len, &ind, &val);
for (i=0; i<len; i++)
{
mat->recvbuf[ind[i]] += val[i] * x[row];
}
}
/* Now can send nonlocal parts of solution to other procs */
MPI_Startall(mat->num_recv, mat->send_req2);
/* copy local part of solution into y */
for (i=0; i<num_local; i++)
y[i] = mat->recvbuf[i];
/* alternatively, loop over a wait any */
MPI_Waitall(mat->num_send, mat->recv_req2, mat->statuses);
/* add all the incoming partial sums to y */
for (i=0; i<mat->sendlen; i++)
y[mat->sendind[i]] += mat->sendbuf[i];
MPI_Waitall(mat->num_recv, mat->send_req2, mat->statuses);
}