2 stage gs (#314)

This PR (by @PaulMullowney #282) includes updates/changes to the 2 stage GS preconditioner. Co-authored-by: Paul Mullowney <pmullown@nrel.gov>
2021-03-24 15:14:00 -07:00 · 2021-03-24 15:14:00 -07:00 · 366b80f89b
commit 366b80f89b
parent 8002200aa0
8 changed files with 115 additions and 93 deletions
--- a/src/parcsr_ls/_hypre_parcsr_ls.h
+++ b/src/parcsr_ls/_hypre_parcsr_ls.h
@ -1963,6 +1963,8 @@ HYPRE_Int hypre_BoomerAMGRelaxKaczmarz( hypre_ParCSRMatrix *A, hypre_ParVector *

 HYPRE_Int hypre_BoomerAMGRelaxTwoStageGaussSeidelDevice ( hypre_ParCSRMatrix *A, hypre_ParVector *f, HYPRE_Real relax_weight, HYPRE_Real omega, hypre_ParVector *u, hypre_ParVector *r, hypre_ParVector *z, HYPRE_Int choice);

+HYPRE_Int hypre_BoomerAMGRelaxTwoStageGaussSeidel( hypre_ParCSRMatrix *A, hypre_ParVector *f, HYPRE_Int *cf_marker, HYPRE_Int relax_points, HYPRE_Real relax_weight, HYPRE_Real omega, hypre_ParVector *u, hypre_ParVector *Vtemp, hypre_ParVector *Ztemp, HYPRE_Int num_inner_iters );
+
 HYPRE_Int hypre_BoomerAMGRelax11TwoStageGaussSeidel( hypre_ParCSRMatrix *A, hypre_ParVector *f, HYPRE_Int *cf_marker, HYPRE_Int relax_points, HYPRE_Real relax_weight, HYPRE_Real omega, hypre_ParVector *u, hypre_ParVector *Vtemp, hypre_ParVector *Ztemp );

 HYPRE_Int hypre_BoomerAMGRelax12TwoStageGaussSeidel( hypre_ParCSRMatrix *A, hypre_ParVector *f, HYPRE_Int *cf_marker, HYPRE_Int relax_points, HYPRE_Real relax_weight, HYPRE_Real omega, hypre_ParVector *u, hypre_ParVector *Vtemp, hypre_ParVector *Ztemp );
--- a/src/parcsr_ls/par_cycle.c
+++ b/src/parcsr_ls/par_cycle.c
@ -326,7 +326,7 @@ hypre_BoomerAMGCycle( void              *amg_vdata,
         /* If no coarsening occurred, apply a simple smoother once */
         Aux_U = U_array[level];
         Aux_F = F_array[level];
-         num_sweep = 1;
+         num_sweep = num_grid_sweeps[0];
         /* TK: Use the user relax type (instead of 0) to allow for setting a
           convergent smoother (e.g. in the solution of singular problems). */
         relax_type = hypre_ParAMGDataUserRelaxType(amg_data);
--- a/src/parcsr_ls/par_relax.c
+++ b/src/parcsr_ls/par_relax.c
@ -55,8 +55,9 @@ hypre_BoomerAMGRelax( hypre_ParCSRMatrix *A,
    *                        triangular)
    *     relax_type = 11 -> Two Stage approximation to GS. Uses the strict lower
    *                        part of the diagonal matrix
-    *     relax_type = 12 -> Two Stage approximation to GS. Uses the full diagonal
-    *                        matrix
+    *     relax_type = 12 -> Two Stage approximation to GS. Uses the strict lower
+    *                        part of the diagonal matrix and a second iteration
+    *                        for additional error approximation
    *     relax_type = 13 -> hybrid L1 Gauss-Seidel forward solve
    *     relax_type = 14 -> hybrid L1 Gauss-Seidel backward solve
    *     relax_type = 15 -> CG
@ -1450,13 +1451,16 @@ hypre_BoomerAMGRelaxKaczmarz( hypre_ParCSRMatrix *A,
   return hypre_error_flag;
 }

+
+
 HYPRE_Int
-hypre_BoomerAMGRelax11TwoStageGaussSeidelHost( hypre_ParCSRMatrix *A,
-                                               hypre_ParVector    *f,
-                                               HYPRE_Real          relax_weight,
-                                               HYPRE_Real          omega,
-                                               hypre_ParVector    *u,
-                                               hypre_ParVector    *Vtemp )
+hypre_BoomerAMGRelaxTwoStageGaussSeidelHost( hypre_ParCSRMatrix *A,
+                                             hypre_ParVector    *f,
+                                             HYPRE_Real          relax_weight,
+                                             HYPRE_Real          omega,
+                                             hypre_ParVector    *u,
+                                             hypre_ParVector    *Vtemp,
+                                             HYPRE_Int           num_inner_iters)
 {
   hypre_CSRMatrix *A_diag      = hypre_ParCSRMatrixDiag(A);
   HYPRE_Int        num_rows    = hypre_CSRMatrixNumRows(A_diag);
@ -1467,9 +1471,8 @@ hypre_BoomerAMGRelax11TwoStageGaussSeidelHost( hypre_ParCSRMatrix *A,
   HYPRE_Complex   *Vtemp_data  = hypre_VectorData(Vtemp_local);
   hypre_Vector    *u_local     = hypre_ParVectorLocalVector(u);
   HYPRE_Complex   *u_data      = hypre_VectorData(u_local);
-   HYPRE_Int        i, jj, ii;
-
-   hypre_ParCSRMatrixMatvecOutOfPlace(-relax_weight, A, u, relax_weight, f, Vtemp);
+   HYPRE_Int        i, k, jj, ii;
+   HYPRE_Complex    multiplier  = 1.0;

   /* Need to check that EVERY diagonal is nonzero first. If any are, throw exception */
   for (i = 0; i < num_rows; i++)
@ -1480,18 +1483,45 @@ hypre_BoomerAMGRelax11TwoStageGaussSeidelHost( hypre_ParCSRMatrix *A,
      }
   }

+   hypre_ParCSRMatrixMatvecOutOfPlace(-relax_weight, A, u, relax_weight, f, Vtemp);
+
+
   for (i = 0; i < num_rows; i++) /* Run the smoother */
   {
-      HYPRE_Complex res = 0.0;
-      for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++)
+      // V = V/D
+      Vtemp_data[i] /= A_diag_data[A_diag_i[i]];
+
+      // u = u + m*v
+      u_data[i] += multiplier * Vtemp_data[i];
+   }
+
+   // adjust for the alternating series
+   multiplier *= -1.0;
+
+   for (k = 0; k < num_inner_iters; ++k)
+   {
+      // By going from bottom to top, we can update Vtemp in place because
+      // we're operating with the strict, lower triangular matrix
+      for (i = num_rows-1; i >=0; i--) /* Run the smoother */
      {
-         ii = A_diag_j[jj];
-         if (ii < i)
+         // spmv for the row first
+         HYPRE_Complex res = 0.0;
+         for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++)
         {
-            res -= (A_diag_data[jj] / A_diag_data[A_diag_i[ii]]) * Vtemp_data[ii];
+            ii = A_diag_j[jj];
+            if (ii < i)
+            {
+               res += A_diag_data[jj] * Vtemp_data[ii];
+            }
         }
+         // diagonal scaling has to come after the spmv accumulation. It's a row scaling
+         // not column
+         Vtemp_data[i] = res / A_diag_data[A_diag_i[i]];
+         u_data[i] += multiplier * Vtemp_data[i];
      }
-      u_data[i] += (Vtemp_data[i] + omega*res) / A_diag_data[A_diag_i[i]];
+
+      // adjust for the alternating series
+      multiplier *= -1.0;
   }

   return hypre_error_flag;
@ -1520,51 +1550,7 @@ hypre_BoomerAMGRelax11TwoStageGaussSeidel( hypre_ParCSRMatrix *A,
   else
 #endif
   {
-      hypre_BoomerAMGRelax11TwoStageGaussSeidelHost(A, f, relax_weight, omega, u, Vtemp);
-   }
-
-   return hypre_error_flag;
-}
-
-HYPRE_Int
-hypre_BoomerAMGRelax12TwoStageGaussSeidelHost( hypre_ParCSRMatrix *A,
-                                               hypre_ParVector    *f,
-                                               HYPRE_Real          relax_weight,
-                                               HYPRE_Real          omega,
-                                               hypre_ParVector    *u,
-                                               hypre_ParVector    *Vtemp )
-{
-   hypre_CSRMatrix *A_diag      = hypre_ParCSRMatrixDiag(A);
-   HYPRE_Int        num_rows    = hypre_CSRMatrixNumRows(A_diag);
-   HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
-   HYPRE_Int       *A_diag_i    = hypre_CSRMatrixI(A_diag);
-   HYPRE_Int       *A_diag_j    = hypre_CSRMatrixJ(A_diag);
-   hypre_Vector    *Vtemp_local = hypre_ParVectorLocalVector(Vtemp);
-   HYPRE_Complex   *Vtemp_data  = hypre_VectorData(Vtemp_local);
-   hypre_Vector    *u_local     = hypre_ParVectorLocalVector(u);
-   HYPRE_Complex   *u_data      = hypre_VectorData(u_local);
-   HYPRE_Int        i, jj, ii;
-
-   hypre_ParCSRMatrixMatvecOutOfPlace(-relax_weight, A, u, relax_weight, f, Vtemp);
-
-   /* Need to check that EVERY diagonal is nonzero first. If any are, throw exception */
-   for (i = 0; i < num_rows; i++)
-   {
-      if (A_diag_data[A_diag_i[i]] == 0.0)
-      {
-         hypre_error_in_arg(1);
-      }
-   }
-
-   for (i = 0; i < num_rows; i++) /* Run the smoother */
-   {
-      HYPRE_Complex res = Vtemp_data[i];
-      for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++)
-      {
-         ii = A_diag_j[jj];
-         res -= (A_diag_data[jj] / A_diag_data[A_diag_i[ii]]) * Vtemp_data[ii];
-      }
-      u_data[i] += (Vtemp_data[i] + omega*res) / A_diag_data[A_diag_i[i]];
+      hypre_BoomerAMGRelaxTwoStageGaussSeidelHost(A, f, relax_weight, omega, u, Vtemp, 1);
   }

   return hypre_error_flag;
@ -1589,12 +1575,12 @@ hypre_BoomerAMGRelax12TwoStageGaussSeidel( hypre_ParCSRMatrix *A,

   if (exec == HYPRE_EXEC_DEVICE)
   {
-      hypre_BoomerAMGRelaxTwoStageGaussSeidelDevice(A, f, relax_weight, omega, u, Vtemp, Ztemp, 0);
+      hypre_BoomerAMGRelaxTwoStageGaussSeidelDevice(A, f, relax_weight, omega, u, Vtemp, Ztemp, 2);
   }
   else
 #endif
   {
-      hypre_BoomerAMGRelax12TwoStageGaussSeidelHost(A, f, relax_weight, omega, u, Vtemp);
+     hypre_BoomerAMGRelaxTwoStageGaussSeidelHost(A, f, relax_weight, omega, u, Vtemp, 2);
   }

   return hypre_error_flag;
--- a/src/parcsr_ls/par_relax_device.c
+++ b/src/parcsr_ls/par_relax_device.c
@ -74,32 +74,6 @@ hypre_BoomerAMGRelaxHybridGaussSeidelDevice( hypre_ParCSRMatrix *A,
   return hypre_error_flag;
 }

-/* option 0: inout = inout + D^{-1}*[(1+w)*r - w*A*x]
- * option 1: inout = inout + D^{-1}*[r - tril(A,-1)*x]
- * Note: r is modified */
-void
-hypre_TwoStageGaussSeidelMatvec(hypre_CSRMatrix *A,
-                                hypre_Vector    *x,
-                                hypre_Vector    *r,
-                                HYPRE_Complex    omega,
-                                hypre_Vector    *inout,
-                                HYPRE_Int        option)
-{
-   if (option == 0)
-   {
-      /* spmv with the full matrix */
-      hypre_CSRMatrixMatvecDevice(0.0, -omega, A, x, 1.0 + omega, r, r, 0.0);
-   }
-   else
-   {
-      /* spmv with L */
-      hypre_CSRMatrixSpMVDevice(-1.0, A, x, 1.0, r, -2);
-   }
-
-   hypreDevice_DiagScaleVector(hypre_CSRMatrixNumRows(A), hypre_CSRMatrixI(A), hypre_CSRMatrixData(A),
-                               hypre_VectorData(r), 1.0, hypre_VectorData(inout));
-}
-
 HYPRE_Int
 hypre_BoomerAMGRelaxTwoStageGaussSeidelDevice ( hypre_ParCSRMatrix *A,
                                                hypre_ParVector    *f,
@ -108,7 +82,7 @@ hypre_BoomerAMGRelaxTwoStageGaussSeidelDevice ( hypre_ParCSRMatrix *A,
                                                hypre_ParVector    *u,
                                                hypre_ParVector    *r,
                                                hypre_ParVector    *z,
-                                                HYPRE_Int           choice)
+                                                HYPRE_Int           num_inner_iters)
 {
   hypre_NvtxPushRange("BoomerAMGRelaxTwoStageGaussSeidelDevice");

@ -119,14 +93,36 @@ hypre_BoomerAMGRelaxTwoStageGaussSeidelDevice ( hypre_ParCSRMatrix *A,
   hypre_Vector    *u_local      = hypre_ParVectorLocalVector(u);
   hypre_Vector    *r_local      = hypre_ParVectorLocalVector(r);
   hypre_Vector    *z_local      = hypre_ParVectorLocalVector(z);
+   HYPRE_Complex   *u_data       = hypre_VectorData(u_local);
   HYPRE_Complex   *r_data       = hypre_VectorData(r_local);
   HYPRE_Complex   *z_data       = hypre_VectorData(z_local);
+   HYPRE_Int        zsize        = hypre_VectorSize(z_local);
+   HYPRE_Int        rsize        = hypre_VectorSize(r_local);
+   HYPRE_Complex    multiplier   = 1.0;
+   HYPRE_Int        i;

   hypre_ParCSRMatrixMatvecOutOfPlace(-relax_weight, A, u, relax_weight, f, r);

   hypreDevice_DiagScaleVector(num_rows, A_diag_i, A_diag_data, r_data, 0.0, z_data);

-   hypre_TwoStageGaussSeidelMatvec(A_diag, z_local, r_local, omega, u_local, choice);
+   // set this so that axpy works out properly. Reset later.
+   hypre_VectorSize(z_local) = rsize;
+
+   // 1) u = u + z
+   hypre_SeqVectorAxpy(multiplier, z_local, u_local);
+   multiplier *= -1.0;
+
+   for (i = 0; i < num_inner_iters; ++i) 
+   {
+       // 2) r = Lz
+       hypre_CSRMatrixSpMVDevice(1.0, A_diag, z_local, 0.0, r_local, -2);
+       // 3) z = r/D, u = u + m*z
+       hypreDevice_DiagScaleVector2(num_rows, A_diag_i, A_diag_data, r_data, multiplier, z_data, u_data);
+       multiplier *= -1.0;
+   }
+
+   // reset this
+   hypre_VectorSize(z_local) = zsize;

   hypre_NvtxPopRange();

@ -134,3 +130,4 @@ hypre_BoomerAMGRelaxTwoStageGaussSeidelDevice ( hypre_ParCSRMatrix *A,
 }

 #endif /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
+
--- a/src/parcsr_ls/protos.h
+++ b/src/parcsr_ls/protos.h
@ -1176,6 +1176,8 @@ HYPRE_Int hypre_BoomerAMGRelaxKaczmarz( hypre_ParCSRMatrix *A, hypre_ParVector *

 HYPRE_Int hypre_BoomerAMGRelaxTwoStageGaussSeidelDevice ( hypre_ParCSRMatrix *A, hypre_ParVector *f, HYPRE_Real relax_weight, HYPRE_Real omega, hypre_ParVector *u, hypre_ParVector *r, hypre_ParVector *z, HYPRE_Int choice);

+HYPRE_Int hypre_BoomerAMGRelaxTwoStageGaussSeidel( hypre_ParCSRMatrix *A, hypre_ParVector *f, HYPRE_Int *cf_marker, HYPRE_Int relax_points, HYPRE_Real relax_weight, HYPRE_Real omega, hypre_ParVector *u, hypre_ParVector *Vtemp, hypre_ParVector *Ztemp, HYPRE_Int num_inner_iters );
+
 HYPRE_Int hypre_BoomerAMGRelax11TwoStageGaussSeidel( hypre_ParCSRMatrix *A, hypre_ParVector *f, HYPRE_Int *cf_marker, HYPRE_Int relax_points, HYPRE_Real relax_weight, HYPRE_Real omega, hypre_ParVector *u, hypre_ParVector *Vtemp, hypre_ParVector *Ztemp );

 HYPRE_Int hypre_BoomerAMGRelax12TwoStageGaussSeidel( hypre_ParCSRMatrix *A, hypre_ParVector *f, HYPRE_Int *cf_marker, HYPRE_Int relax_points, HYPRE_Real relax_weight, HYPRE_Real omega, hypre_ParVector *u, hypre_ParVector *Vtemp, hypre_ParVector *Ztemp );
--- a/src/utilities/_hypre_utilities.h
+++ b/src/utilities/_hypre_utilities.h
@ -1605,6 +1605,7 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR
 HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle);
 HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle);
 HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y);
+HYPRE_Int hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y, HYPRE_Complex *z);
 HYPRE_Int hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y);
 HYPRE_Int hypreDevice_MaskedIVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y, HYPRE_Int *mask);
 HYPRE_Int hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v);
--- a/src/utilities/hypre_cuda_utils.c
+++ b/src/utilities/hypre_cuda_utils.c
@ -614,6 +614,39 @@ hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data,
   return hypre_error_flag;
 }

+__global__ void
+hypreCUDAKernel_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y, HYPRE_Complex *z)
+{
+   HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
+
+   if (i < n)
+   {
+      HYPRE_Complex t = x[i] / A_data[A_i[i]];
+      y[i] = t;
+      z[i] += beta*t;
+   }
+}
+
+/* y = diag(A) \ x
+ * z = beta * (diag(A) \ x) + z
+ * Note: Assume A_i[i] points to the ith diagonal entry of A */
+HYPRE_Int
+hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y, HYPRE_Complex *z)
+{
+   /* trivial case */
+   if (n <= 0)
+   {
+      return hypre_error_flag;
+   }
+
+   dim3 bDim = hypre_GetDefaultCUDABlockDimension();
+   dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
+
+   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z );
+
+   return hypre_error_flag;
+}
+
 __global__ void
 hypreCUDAKernel_BigToSmallCopy(      HYPRE_Int*    __restrict__ tgt,
                               const HYPRE_BigInt* __restrict__ src,
--- a/src/utilities/protos.h
+++ b/src/utilities/protos.h
@ -249,6 +249,7 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR
 HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle);
 HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle);
 HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y);
+HYPRE_Int hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y, HYPRE_Complex *z);
 HYPRE_Int hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y);
 HYPRE_Int hypreDevice_MaskedIVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y, HYPRE_Int *mask);
 HYPRE_Int hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v);