Fix MGR setup on GPU and MGR bugfixes. (#260)

This PR addresses initial changes necessary to perform MGR setup on GPUs. It also fixes a bug in the iterative use of AMG for F-relaxation, adds options to print statistics of the F-relaxation solver (for the V-cycle smoother), and places the diagonal of a transposed square matrix as the first element in the row. This last change required a couple minor changes to the saved files.
2021-05-10 13:19:34 -07:00 · 2021-05-10 13:19:34 -07:00 · c272f6980c
commit c272f6980c
parent 3f12d47651
12 changed files with 219 additions and 88 deletions
--- a/src/parcsr_ls/HYPRE_parcsr_ls.h
+++ b/src/parcsr_ls/HYPRE_parcsr_ls.h
@ -3784,10 +3784,12 @@ HYPRE_MGRSetLevelFRelaxNumFunctions(HYPRE_Solver solver, HYPRE_Int *num_function
 *    - 0    : injection \f$[0  I]\f$
 *    - 1    : unscaled (not recommended)
 *    - 2    : diagonal scaling (Jacobi)
+ *    - 3    : approximate inverse
+ *    - 4    : pAIR distance 1
+ *    - 5    : pAIR distance 2
 *    - else : use classical modified interpolation
 *
- * These options are currently active for the last stage reduction. Intermediate
- * reduction levels use injection. The default is injection.
+ * The default is injection.
 **/
 HYPRE_Int
 HYPRE_MGRSetRestrictType( HYPRE_Solver solver,
@ -3806,16 +3808,17 @@ HYPRE_MGRSetNumRestrictSweeps( HYPRE_Solver solver,
                               HYPRE_Int nsweeps );

 /**
- * (Optional) Set the strategy for computing the MGR restriction operator.
+ * (Optional) Set the strategy for computing the MGR interpolation operator.
 * Options for \e interp_type are:
 *
 *    - 0    : injection \f$[0  I]^{T}\f$
 *    - 1    : unscaled (not recommended)
 *    - 2    : diagonal scaling (Jacobi)
- *    - else : use default (classical modified interpolation)
+ *    - 3    : classical modified interpolation
+ *    - 4    : approximate inverse
+ *    - else : classical modified interpolation
 *
- * These options are currently active for the last stage reduction. Intermediate
- * reduction levels use diagonal scaling.
+ * The default is diagonal scaling.
 **/
 HYPRE_Int
 HYPRE_MGRSetInterpType( HYPRE_Solver solver,
@ -3879,6 +3882,14 @@ HYPRE_Int
 HYPRE_MGRSetPrintLevel( HYPRE_Solver solver,
                        HYPRE_Int print_level );

+HYPRE_Int
+HYPRE_MGRSetFrelaxPrintLevel( HYPRE_Solver solver,
+                        HYPRE_Int print_level );
+                        
+HYPRE_Int
+HYPRE_MGRSetCoarseGridPrintLevel( HYPRE_Solver solver,
+                        HYPRE_Int print_level );                        
+
 /**
 * (Optional) Set the threshold to compress the coarse grid at each level
 * Use threshold = 0.0 if no truncation is applied. Otherwise, set the threshold
--- a/src/parcsr_ls/HYPRE_parcsr_mgr.c
+++ b/src/parcsr_ls/HYPRE_parcsr_mgr.c
@ -358,6 +358,24 @@ HYPRE_MGRSetTruncateCoarseGridThreshold( HYPRE_Solver solver, HYPRE_Real thresho
   return hypre_MGRSetTruncateCoarseGridThreshold( solver, threshold );
 }

+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetFrelaxPrintLevel
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetFrelaxPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level )
+{
+   return hypre_MGRSetFrelaxPrintLevel( solver, print_level );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetCoarseGridPrintLevel
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetCoarseGridPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level )
+{
+   return hypre_MGRSetCoarseGridPrintLevel( solver, print_level );
+}
+
 /*--------------------------------------------------------------------------
 * HYPRE_MGRSetPrintLevel
 *--------------------------------------------------------------------------*/
--- a/src/parcsr_ls/_hypre_parcsr_ls.h
+++ b/src/parcsr_ls/_hypre_parcsr_ls.h
@ -2190,6 +2190,8 @@ HYPRE_Int hypre_MGRSetNumRelaxSweeps( void *mgr_vdata, HYPRE_Int nsweeps );
 HYPRE_Int hypre_MGRSetNumInterpSweeps( void *mgr_vdata, HYPRE_Int nsweeps );
 HYPRE_Int hypre_MGRSetNumRestrictSweeps( void *mgr_vdata, HYPRE_Int nsweeps );
 HYPRE_Int hypre_MGRSetPrintLevel( void *mgr_vdata, HYPRE_Int print_level );
+HYPRE_Int hypre_MGRSetFrelaxPrintLevel( void *mgr_vdata, HYPRE_Int print_level );
+HYPRE_Int hypre_MGRSetCoarseGridPrintLevel( void *mgr_vdata, HYPRE_Int print_level );
 HYPRE_Int hypre_MGRSetTruncateCoarseGridThreshold( void *mgr_vdata, HYPRE_Real threshold);
 HYPRE_Int hypre_MGRSetLogging( void *mgr_vdata, HYPRE_Int logging );
 HYPRE_Int hypre_MGRSetMaxIter( void *mgr_vdata, HYPRE_Int max_iter );
--- a/src/parcsr_ls/par_mgr.c
+++ b/src/parcsr_ls/par_mgr.c
@ -86,6 +86,8 @@ hypre_MGRCreate()

  (mgr_data -> logging) = 0;
  (mgr_data -> print_level) = 0;
+  (mgr_data -> frelax_print_level) = 0;
+  (mgr_data -> cg_print_level) = 0;

  (mgr_data -> l1_norms) = NULL;

@ -1997,6 +1999,7 @@ hypre_MGRComputeNonGalerkinCoarseGrid(hypre_ParCSRMatrix    *A,
  HYPRE_Int my_id;
  MPI_Comm comm = hypre_ParCSRMatrixComm(A);
  hypre_MPI_Comm_rank(comm,&my_id);
+  HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A);

  n_local_fine_grid = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
  c_marker = hypre_CTAlloc(HYPRE_Int, n_local_fine_grid, HYPRE_MEMORY_HOST);
@ -2131,19 +2134,18 @@ hypre_MGRComputeNonGalerkinCoarseGrid(hypre_ParCSRMatrix    *A,
  HYPRE_Int             *A_h_correction_offd_i = hypre_CSRMatrixI(A_h_correction_offd);
  HYPRE_Int             *A_h_correction_offd_j = hypre_CSRMatrixJ(A_h_correction_offd);

-  // Allow for maximum dropping with Pmax = 0
-  //if (Pmax > 0)
-  //{
+  if (Pmax > 0)
+  {
    if (ordering == 0) // interleaved ordering
    {
-      HYPRE_Int *A_h_correction_diag_i_new = hypre_CTAlloc(HYPRE_Int, n_local_cpoints+1, HYPRE_MEMORY_HOST);
-      HYPRE_Int *A_h_correction_diag_j_new = hypre_CTAlloc(HYPRE_Int, (bsize + max_elmts)*n_local_cpoints, HYPRE_MEMORY_HOST);
-      HYPRE_Complex *A_h_correction_diag_data_new = hypre_CTAlloc(HYPRE_Complex, (bsize + max_elmts)*n_local_cpoints, HYPRE_MEMORY_HOST);
+      HYPRE_Int *A_h_correction_diag_i_new = hypre_CTAlloc(HYPRE_Int, n_local_cpoints+1, memory_location);
+      HYPRE_Int *A_h_correction_diag_j_new = hypre_CTAlloc(HYPRE_Int, (bsize + max_elmts)*n_local_cpoints, memory_location);
+      HYPRE_Complex *A_h_correction_diag_data_new = hypre_CTAlloc(HYPRE_Complex, (bsize + max_elmts)*n_local_cpoints, memory_location);
      HYPRE_Int num_nonzeros_diag_new = 0;

-      HYPRE_Int *A_h_correction_offd_i_new = hypre_CTAlloc(HYPRE_Int, n_local_cpoints+1, HYPRE_MEMORY_HOST);
-      HYPRE_Int *A_h_correction_offd_j_new = hypre_CTAlloc(HYPRE_Int, max_elmts*n_local_cpoints, HYPRE_MEMORY_HOST);
-      HYPRE_Complex *A_h_correction_offd_data_new = hypre_CTAlloc(HYPRE_Complex, max_elmts*n_local_cpoints, HYPRE_MEMORY_HOST);
+      HYPRE_Int *A_h_correction_offd_i_new = hypre_CTAlloc(HYPRE_Int, n_local_cpoints+1, memory_location);
+      HYPRE_Int *A_h_correction_offd_j_new = hypre_CTAlloc(HYPRE_Int, max_elmts*n_local_cpoints, memory_location);
+      HYPRE_Complex *A_h_correction_offd_data_new = hypre_CTAlloc(HYPRE_Complex, max_elmts*n_local_cpoints, memory_location);
      HYPRE_Int num_nonzeros_offd_new = 0;


@ -2212,17 +2214,17 @@ hypre_MGRComputeNonGalerkinCoarseGrid(hypre_ParCSRMatrix    *A,
        hypre_TFree(aux_data, HYPRE_MEMORY_HOST);
      }

-      hypre_TFree(A_h_correction_diag_i, HYPRE_MEMORY_HOST);
-      hypre_TFree(A_h_correction_diag_j, HYPRE_MEMORY_HOST);
-      hypre_TFree(A_h_correction_diag_data, HYPRE_MEMORY_HOST);
+      hypre_TFree(A_h_correction_diag_i, memory_location);
+      hypre_TFree(A_h_correction_diag_j, memory_location);
+      hypre_TFree(A_h_correction_diag_data, memory_location);
      hypre_CSRMatrixI(A_h_correction_diag) = A_h_correction_diag_i_new;
      hypre_CSRMatrixJ(A_h_correction_diag) = A_h_correction_diag_j_new;
      hypre_CSRMatrixData(A_h_correction_diag) = A_h_correction_diag_data_new;
      hypre_CSRMatrixNumNonzeros(A_h_correction_diag) = num_nonzeros_diag_new;

-      if (A_h_correction_offd_i) hypre_TFree(A_h_correction_offd_i, HYPRE_MEMORY_HOST);
-      if (A_h_correction_offd_j) hypre_TFree(A_h_correction_offd_j, HYPRE_MEMORY_HOST);
-      if (A_h_correction_offd_data) hypre_TFree(A_h_correction_offd_data, HYPRE_MEMORY_HOST);
+      if (A_h_correction_offd_i) hypre_TFree(A_h_correction_offd_i, memory_location);
+      if (A_h_correction_offd_j) hypre_TFree(A_h_correction_offd_j, memory_location);
+      if (A_h_correction_offd_data) hypre_TFree(A_h_correction_offd_data, memory_location);
      hypre_CSRMatrixI(A_h_correction_offd) = A_h_correction_offd_i_new;
      hypre_CSRMatrixJ(A_h_correction_offd) = A_h_correction_offd_j_new;
      hypre_CSRMatrixData(A_h_correction_offd) = A_h_correction_offd_data_new;
@ -2230,10 +2232,10 @@ hypre_MGRComputeNonGalerkinCoarseGrid(hypre_ParCSRMatrix    *A,
    }
    else
    {
-      hypre_printf("Error!! Block ordering is not supported at the moment\n");
+      hypre_printf("Error!! Block ordering for non-Galerkin coarse grid is not currently supported\n");
      exit(-1);
    }
-  //}
+  }
  //hypre_MGRParCSRMatrixTruncate(A_h_correction, max_elmts);
  //wall_time = time_getWallclockSeconds() - wall_time;
  //hypre_printf("Filter A_h_correction time: %1.5f\n", wall_time);
@ -2744,14 +2746,15 @@ hypre_MGRBuildInterpApproximateInverse(hypre_ParCSRMatrix   *A,

  // Get A_FF
  hypre_MGRGetSubBlock(A, F_marker, F_marker, 0, &A_ff);
+//  hypre_ParCSRMatrixPrintIJ(A_ff, 1, 1, "A_ff");
  // Get A_FC
  hypre_MGRGetSubBlock(A, F_marker, C_marker, 0, &A_fc);

  hypre_MGRApproximateInverse(A_ff, &A_ff_inv);
-  hypre_ParCSRMatrixPrintIJ(A_ff_inv, 1, 1, "A_ff_inv");
-  hypre_ParCSRMatrixPrintIJ(A_fc, 1, 1, "A_fc");
+//  hypre_ParCSRMatrixPrintIJ(A_ff_inv, 1, 1, "A_ff_inv");
+//  hypre_ParCSRMatrixPrintIJ(A_fc, 1, 1, "A_fc");
  minus_Wp = hypre_ParMatmul(A_ff_inv, A_fc);
-  hypre_ParCSRMatrixPrintIJ(minus_Wp, 1, 1, "Wp");
+//  hypre_ParCSRMatrixPrintIJ(minus_Wp, 1, 1, "Wp");

  hypre_CSRMatrix *minus_Wp_diag = hypre_ParCSRMatrixDiag(minus_Wp);
  HYPRE_Real      *minus_Wp_diag_data = hypre_CSRMatrixData(minus_Wp_diag);
@ -3117,19 +3120,27 @@ hypre_MGRBuildInterp(hypre_ParCSRMatrix   *A,
         HYPRE_Int         interp_type,
                     HYPRE_Int             numsweeps)
 {
-  //HYPRE_Int i;
+//  HYPRE_Int i;
  hypre_ParCSRMatrix    *P_ptr = NULL;
-  //HYPRE_Real       jac_trunc_threshold = trunc_factor;
-  //HYPRE_Real       jac_trunc_threshold_minus = 0.5*jac_trunc_threshold;
+//  HYPRE_Real       jac_trunc_threshold = trunc_factor;
+//  HYPRE_Real       jac_trunc_threshold_minus = 0.5*jac_trunc_threshold;

  /* Interpolation for each level */
  if (interp_type <3)
  {
    hypre_MGRBuildP( A,CF_marker,num_cpts_global,interp_type,debug_flag,&P_ptr);
-    /* Could do a few sweeps of Jacobi to further improve P */
-    //for(i=0; i<numsweeps; i++)
-    //  hypre_BoomerAMGJacobiInterp(A, &P_ptr, S,1, NULL, CF_marker, 0, jac_trunc_threshold, jac_trunc_threshold_minus );
-    //hypre_BoomerAMGInterpTruncation(P_ptr, trunc_factor, max_elmts);
+    
+    /* Could do a few sweeps of Jacobi to further improve Jacobi interpolation P */
+/*
+    if(interp_type == 2)
+    {
+       for(i=0; i<numsweeps; i++)
+       {
+         hypre_BoomerAMGJacobiInterp(A, &P_ptr, S,1, NULL, CF_marker, 0, jac_trunc_threshold, jac_trunc_threshold_minus );
+       }
+       hypre_BoomerAMGInterpTruncation(P_ptr, trunc_factor, max_elmts);
+    }
+*/
  }
  else if (interp_type == 4)
  {
@ -3146,17 +3157,6 @@ hypre_MGRBuildInterp(hypre_ParCSRMatrix   *A,
    /* Classical modified interpolation */
    hypre_BoomerAMGBuildInterp(A, CF_marker, S, num_cpts_global,1, NULL,debug_flag,
                      trunc_factor, max_elmts, &P_ptr);
-
-    /* Do k steps of Jacobi build W for P = [-W I].
-     * Note that BoomerAMGJacobiInterp assumes you have some initial P,
-     * hence we need to initialize P as above, before calling this routine.
-     * If numsweeps = 0, the following step is skipped and P is returned as is.
-     * Looping here is equivalent to improving P by Jacobi interpolation
-    */
-    //for(i=0; i<numsweeps; i++)
-    //  hypre_BoomerAMGJacobiInterp(A, &P_ptr, S,1, NULL, CF_marker,
-    //                                 0, jac_trunc_threshold,
-    //                                 jac_trunc_threshold_minus );
  }

  /* set pointer to P */
@ -3193,14 +3193,8 @@ hypre_MGRBuildRestrict(hypre_ParCSRMatrix     *A,
  {
    hypre_ParCSRMatrixTranspose(A, &AT, 1);
  }
-  if (restrict_type > 5)
-  {
-    /* Build new strength matrix */
-    hypre_BoomerAMGCreateS(AT, strong_threshold, max_row_sum, 1, NULL, &ST);
-    /* use appropriate communication package for Strength matrix */
-  }

-  /* Interpolation for each level */
+  /* Restriction for each level */
  if (restrict_type == 0)
  {
    hypre_MGRBuildP(A, CF_marker, num_cpts_global, restrict_type, debug_flag, &R_ptr);
@ -3208,31 +3202,20 @@ hypre_MGRBuildRestrict(hypre_ParCSRMatrix     *A,
  else if (restrict_type == 1 || restrict_type == 2)
  {
    hypre_MGRBuildP(AT, CF_marker, num_cpts_global, restrict_type, debug_flag, &R_ptr);
-    /* Could do a few sweeps of Jacobi to further improve P */
-    //for(i=0; i<numsweeps; i++)
-      //    hypre_BoomerAMGJacobiInterp(A, &R_ptr, S,1, NULL, CF_marker, 0, jac_trunc_threshold, jac_trunc_threshold_minus );
-    //hypre_BoomerAMGInterpTruncation(R_ptr, trunc_factor, max_elmts);
  }
-  else if (restrict_type == 4)
+  else if (restrict_type == 3)
  {
-    hypre_MGRBuildInterpApproximateInverse(A, CF_marker, num_cpts_global, debug_flag, &R_ptr);
+    hypre_MGRBuildInterpApproximateInverse(AT, CF_marker, num_cpts_global, debug_flag, &R_ptr);
    hypre_BoomerAMGInterpTruncation(R_ptr, trunc_factor, max_elmts);
  }
  else
  {
+    /* Build new strength matrix */
+    hypre_BoomerAMGCreateS(AT, strong_threshold, max_row_sum, 1, NULL, &ST);
+    
    /* Classical modified interpolation */
    hypre_BoomerAMGBuildInterp(AT, CF_marker, ST, num_cpts_global,1, NULL,debug_flag,
                               trunc_factor, max_elmts, &R_ptr);
-
-    /* Do k steps of Jacobi build W for P = [-W I].
-    * Note that BoomerAMGJacobiInterp assumes you have some initial P,
-    * hence we need to initialize P as above, before calling this routine.
-    * If numsweeps = 0, the following step is skipped and P is returned as is.
-    * Looping here is equivalent to improving P by Jacobi interpolation
-    */
-    // for(i=0; i<numsweeps; i++)
-    //   hypre_BoomerAMGJacobiInterp(A, &R_ptr, S,1, NULL, CF_marker, 0,
-    //                               jac_trunc_threshold, jac_trunc_threshold_minus);
  }

  /* set pointer to P */
@ -4492,6 +4475,24 @@ hypre_MGRSetTruncateCoarseGridThreshold( void *mgr_vdata, HYPRE_Real threshold)
  return hypre_error_flag;
 }

+/* Set print level for F-relaxation solver */
+HYPRE_Int
+hypre_MGRSetFrelaxPrintLevel( void *mgr_vdata, HYPRE_Int print_level )
+{
+  hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+  (mgr_data -> frelax_print_level) = print_level;  
+  return hypre_error_flag;
+}
+
+/* Set print level for coarse grid solver */
+HYPRE_Int
+hypre_MGRSetCoarseGridPrintLevel( void *mgr_vdata, HYPRE_Int print_level )
+{
+  hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+  (mgr_data -> cg_print_level) = print_level;  
+  return hypre_error_flag;
+}
+
 /* Set print level for mgr solver */
 HYPRE_Int
 hypre_MGRSetPrintLevel( void *mgr_vdata, HYPRE_Int print_level )
@ -4501,7 +4502,7 @@ hypre_MGRSetPrintLevel( void *mgr_vdata, HYPRE_Int print_level )
  return hypre_error_flag;
 }

-/* Set print level for mgr solver */
+/* Set logging level for mgr solver */
 HYPRE_Int
 hypre_MGRSetLogging( void *mgr_vdata, HYPRE_Int logging )
 {
@ -4615,6 +4616,7 @@ hypre_MGRGetSubBlock( hypre_ParCSRMatrix   *A,
  MPI_Comm        comm = hypre_ParCSRMatrixComm(A);
  hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
  hypre_ParCSRCommHandle  *comm_handle;
+  HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A);

  hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
  HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
@ -4855,18 +4857,18 @@ hypre_MGRGetSubBlock( hypre_ParCSRMatrix   *A,

  Ablock_diag_size = jj_counter;

-  Ablock_diag_i    = hypre_CTAlloc(HYPRE_Int, ii_counter+1, HYPRE_MEMORY_HOST);
-  Ablock_diag_j    = hypre_CTAlloc(HYPRE_Int, Ablock_diag_size, HYPRE_MEMORY_HOST);
-  Ablock_diag_data = hypre_CTAlloc(HYPRE_Real, Ablock_diag_size, HYPRE_MEMORY_HOST);
+  Ablock_diag_i    = hypre_CTAlloc(HYPRE_Int, ii_counter+1, memory_location);
+  Ablock_diag_j    = hypre_CTAlloc(HYPRE_Int, Ablock_diag_size, memory_location);
+  Ablock_diag_data = hypre_CTAlloc(HYPRE_Real, Ablock_diag_size, memory_location);

  Ablock_diag_i[ii_counter] = jj_counter;


  Ablock_offd_size = jj_counter_offd;

-  Ablock_offd_i    = hypre_CTAlloc(HYPRE_Int, ii_counter+1, HYPRE_MEMORY_HOST);
-  Ablock_offd_j    = hypre_CTAlloc(HYPRE_Int, Ablock_offd_size, HYPRE_MEMORY_HOST);
-  Ablock_offd_data = hypre_CTAlloc(HYPRE_Real, Ablock_offd_size, HYPRE_MEMORY_HOST);
+  Ablock_offd_i    = hypre_CTAlloc(HYPRE_Int, ii_counter+1, memory_location);
+  Ablock_offd_j    = hypre_CTAlloc(HYPRE_Int, Ablock_offd_size, memory_location);
+  Ablock_offd_data = hypre_CTAlloc(HYPRE_Real, Ablock_offd_size, memory_location);

  /*-----------------------------------------------------------------------
   *  Intialize some stuff.
@ -5020,7 +5022,7 @@ hypre_MGRGetSubBlock( hypre_ParCSRMatrix   *A,
      }
    }

-    col_map_offd_Ablock = hypre_CTAlloc(HYPRE_BigInt, num_cols_Ablock_offd, HYPRE_MEMORY_HOST);
+    col_map_offd_Ablock = hypre_CTAlloc(HYPRE_BigInt, num_cols_Ablock_offd, memory_location);
    tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_Ablock_offd, HYPRE_MEMORY_HOST);
    index = 0;
    for (i=0; i < num_cols_Ablock_offd; i++)
--- a/src/parcsr_ls/par_mgr.h
+++ b/src/parcsr_ls/par_mgr.h
@ -62,6 +62,8 @@ typedef struct
  HYPRE_Int     relax_type;
  HYPRE_Int     logging;
  HYPRE_Int     print_level;
+  HYPRE_Int     frelax_print_level;
+  HYPRE_Int     cg_print_level;
  HYPRE_Int     max_iter;
  HYPRE_Int     relax_order;
  HYPRE_Int     num_relax_sweeps;
--- a/src/parcsr_ls/par_mgr_setup.c
+++ b/src/parcsr_ls/par_mgr_setup.c
@ -107,7 +107,7 @@ hypre_MGRSetup( void               *mgr_vdata,
  hypre_ParCSRMatrix *A_ff_inv = (mgr_data -> A_ff_inv);

  HYPRE_Int use_air = 0;
-  HYPRE_Int truncate_cg_threshold = (mgr_data -> truncate_coarse_grid_threshold);
+  HYPRE_Real truncate_cg_threshold = (mgr_data -> truncate_coarse_grid_threshold);
 //  HYPRE_Real wall_time;

  /* ----- begin -----*/
@ -847,7 +847,7 @@ hypre_MGRSetup( void               *mgr_vdata,
      {
        HYPRE_Int block_num_f_points = (lev == 0 ? block_size : block_num_coarse_indexes[lev-1]) - block_num_coarse_indexes[lev];
        hypre_MGRComputeNonGalerkinCoarseGrid(A_array[lev], P, RT, block_num_f_points,
-          /* ordering */0, /* method */ 0, max_elmts, /* keep_stencil */ 0, CF_marker_array[lev], &RAP_ptr);
+          /* ordering */set_c_points_method, /* method (approx. inverse or not) */ 0, max_elmts, /* keep_stencil */ 0, CF_marker_array[lev], &RAP_ptr);
        hypre_ParCSRMatrixOwnsColStarts(RAP_ptr) = 0;
        hypre_ParCSRMatrixOwnsColStarts(P_array[lev]) = 0;
        hypre_ParCSRMatrixOwnsRowStarts(RT) = 0;
@ -914,12 +914,12 @@ hypre_MGRSetup( void               *mgr_vdata,
        A_ff_array[lev] = A_ff_ptr;

        aff_solver[lev] = (HYPRE_Solver*) hypre_BoomerAMGCreate();
-        hypre_BoomerAMGSetMaxIter(aff_solver[lev], 1);
+        hypre_BoomerAMGSetMaxIter(aff_solver[lev], mgr_data -> num_relax_sweeps);
        hypre_BoomerAMGSetTol(aff_solver[lev], 0.0);
        hypre_BoomerAMGSetRelaxOrder(aff_solver[lev], 1);
        //hypre_BoomerAMGSetAggNumLevels(aff_solver[lev], 1);
        hypre_BoomerAMGSetNumSweeps(aff_solver[lev], 3);
-        hypre_BoomerAMGSetPrintLevel(aff_solver[lev], print_level);
+        hypre_BoomerAMGSetPrintLevel(aff_solver[lev], mgr_data -> frelax_print_level);
        hypre_BoomerAMGSetNumFunctions(aff_solver[lev], 1);

        fine_grid_solver_setup(aff_solver[lev], A_ff_ptr, F_fine_array[lev+1], U_fine_array[lev+1]);
@ -1053,7 +1053,7 @@ hypre_MGRSetup( void               *mgr_vdata,
    hypre_BoomerAMGSetMaxIter ( default_cg_solver, 1 );
    hypre_BoomerAMGSetTol ( default_cg_solver, 0.0 );
    hypre_BoomerAMGSetRelaxOrder( default_cg_solver, 1);
-    hypre_BoomerAMGSetPrintLevel(default_cg_solver, print_level);
+    hypre_BoomerAMGSetPrintLevel(default_cg_solver, mgr_data -> cg_print_level);
    /* set setup and solve functions */
    coarse_grid_solver_setup =  (HYPRE_Int (*)(void*, void*, void*, void*)) hypre_BoomerAMGSetup;
    coarse_grid_solver_solve =  (HYPRE_Int (*)(void*, void*, void*, void*)) hypre_BoomerAMGSolve;
--- a/src/parcsr_ls/par_mgr_solve.c
+++ b/src/parcsr_ls/par_mgr_solve.c
@ -579,6 +579,8 @@ hypre_MGRCycle( void               *mgr_vdata,
   HYPRE_Int       fine_grid;
   HYPRE_Int       Not_Finished;
   HYPRE_Int       cycle_type;
+   HYPRE_Int            print_level = (mgr_data -> print_level);   
+   HYPRE_Int            frelax_print_level = (mgr_data -> frelax_print_level);   

   hypre_ParCSRMatrix   **A_array = (mgr_data -> A_array);
   hypre_ParCSRMatrix   **RT_array  = (mgr_data -> RT_array);
@ -596,6 +598,7 @@ hypre_MGRCycle( void               *mgr_vdata,
   hypre_Vector       **relax_l1_norms = (mgr_data -> l1_norms);
   hypre_ParVector      *Vtemp = (mgr_data -> Vtemp);
   hypre_ParVector      *Ztemp = (mgr_data -> Ztemp);
+//   hypre_ParVector      *Utemp = (mgr_data -> Utemp);

   hypre_ParVector      **U_fine_array = (mgr_data -> U_fine_array);
   hypre_ParVector      **F_fine_array = (mgr_data -> F_fine_array);
@ -643,7 +646,7 @@ hypre_MGRCycle( void               *mgr_vdata,
            HYPRE_Real convergence_factor_cg;
            hypre_BoomerAMGGetRelResidualNorm(cg_solver, &convergence_factor_cg);
            (mgr_data -> cg_convergence_factor) = convergence_factor_cg;
-            if ((mgr_data -> print_level) > 1 && my_id == 0 && convergence_factor_cg > 1.0)
+            if ((print_level) > 1 && my_id == 0 && convergence_factor_cg > 1.0)
            {
               hypre_printf("Warning!!! Coarse grid solve diverges. Factor = %1.2e\n", convergence_factor_cg);
            }
@ -715,11 +718,77 @@ hypre_MGRCycle( void               *mgr_vdata,
            //                                    U_array[level], 1.0, F_array[level], Vtemp);
            //  convergence_factor_frelax = hypre_ParVectorInnerProd(Vtemp, Vtemp);

+            HYPRE_Real resnorm, init_resnorm;
+            HYPRE_Real rhs_norm, old_resnorm;
+            HYPRE_Real rel_resnorm = 1.0;
+            HYPRE_Real conv_factor = 1.0;
+            if (frelax_print_level > 1)
+            {                                
+                hypre_ParCSRMatrixMatvecOutOfPlace(-1.0, A_array[level],
+                                                U_array[level], 1.0, F_array[level], Vtemp);
+                                                
+                resnorm = hypre_ParVectorInnerProd(Vtemp, Vtemp);                                
+                init_resnorm = resnorm;
+                rhs_norm = sqrt(hypre_ParVectorInnerProd(F_array[level], F_array[level]));
+                
+                if (rhs_norm > HYPRE_REAL_EPSILON)
+                {
+                   rel_resnorm = init_resnorm / rhs_norm;
+                }                                      
+                else
+                {
+                   /* rhs is zero, return a zero solution */
+                   hypre_ParVectorSetConstantValues(U_array[0], 0.0);
+
+                   HYPRE_ANNOTATE_FUNC_END;
+
+                   return hypre_error_flag;
+                }
+                if(my_id == 0)
+                {                                                
+                   hypre_printf("\nBegin F-relaxation: V-Cycle Smoother \n");
+                   hypre_printf("                                            relative\n");
+                   hypre_printf("               residual        factor       residual\n");
+                   hypre_printf("               --------        ------       --------\n");
+                   hypre_printf("    Initial    %e                 %e\n",init_resnorm,
+                                     rel_resnorm);
+               }
+            }
+
            for(i=0; i<nsweeps; i++)
            {
               hypre_MGRFrelaxVcycle(FrelaxVcycleData[level], F_array[level], U_array[level]);
-            }
+               
+               if(frelax_print_level > 1)
+               {
+                  old_resnorm = resnorm;
+                  hypre_ParCSRMatrixMatvecOutOfPlace(-1.0, A_array[level],
+                                                   U_array[level], 1.0, F_array[level], Vtemp);
+                  resnorm = hypre_ParVectorInnerProd(Vtemp, Vtemp);                                

+                  if (old_resnorm) conv_factor = resnorm / old_resnorm;
+                  else conv_factor = resnorm;               
+                
+                  if (rhs_norm > HYPRE_REAL_EPSILON)
+                  {
+                     rel_resnorm = resnorm / rhs_norm;
+                  }
+                  else
+                  {
+                     rel_resnorm = resnorm;
+                  }   
+
+                  if (my_id == 0 )
+                  {
+                      hypre_printf("\n    V-Cycle %2d   %e    %f     %e \n", i,
+                      resnorm, conv_factor, rel_resnorm);
+                  }
+               }
+            }
+            if (my_id == 0 && frelax_print_level > 1)
+            {     
+                hypre_printf("End F-relaxation: V-Cycle Smoother \n\n");
+            }
            // compute residual after solve
            //hypre_ParCSRMatrixMatvecOutOfPlace(-1.0, A_array[level],
            //                                  U_array[level], 1.0, F_array[level], Vtemp);
@ -728,11 +797,22 @@ hypre_MGRCycle( void               *mgr_vdata,
         }
         else if (Frelax_method[level] == 2)
         {
-            hypre_ParVectorSetConstantValues(F_fine_array[coarse_grid], 0.0);
-            hypre_MGRAddVectorR(CF_marker[fine_grid], FMRK, 1.0, F_array[fine_grid], 0.0, &(F_fine_array[coarse_grid]));
-            hypre_ParVectorSetConstantValues(U_fine_array[coarse_grid], 0.0);
+                // We need to first compute residual to ensure that 
+                // F-relaxation is reducing the global residual
+
+                alpha = -1.0;
+                beta = 1.0;
+               hypre_ParCSRMatrixMatvecOutOfPlace(alpha, A_array[fine_grid], U_array[fine_grid], beta, F_array[fine_grid], Vtemp);
+               
+            // restrict to F points
+//            hypre_ParVectorSetConstantValues(F_fine_array[coarse_grid], 0.0);
+            hypre_MGRAddVectorR(CF_marker[fine_grid], FMRK, 1.0, Vtemp, 0.0, &(F_fine_array[coarse_grid]));
+            // initialize solution array
+            hypre_ParVectorSetConstantValues(U_fine_array[coarse_grid], 0.0);            
+            // solve
            fine_grid_solver_solve((mgr_data -> aff_solver)[fine_grid], A_ff_array[fine_grid], F_fine_array[coarse_grid],
                  U_fine_array[coarse_grid]);
+            // update solution      
            hypre_MGRAddVectorP(CF_marker[fine_grid], FMRK, 1.0, U_fine_array[coarse_grid], 1.0, &(U_array[fine_grid]));
         }
         else
@ -756,7 +836,7 @@ hypre_MGRCycle( void               *mgr_vdata,
         alpha = 1.0;
         beta = 0.0;

-         if (restrict_type[fine_grid] > 3)
+         if (restrict_type[fine_grid] == 4 || restrict_type[fine_grid] == 5)
            use_air = 1;

         if (use_air)
--- a/src/parcsr_ls/protos.h
+++ b/src/parcsr_ls/protos.h
@ -1401,6 +1401,8 @@ HYPRE_Int hypre_MGRSetNumRelaxSweeps( void *mgr_vdata, HYPRE_Int nsweeps );
 HYPRE_Int hypre_MGRSetNumInterpSweeps( void *mgr_vdata, HYPRE_Int nsweeps );
 HYPRE_Int hypre_MGRSetNumRestrictSweeps( void *mgr_vdata, HYPRE_Int nsweeps );
 HYPRE_Int hypre_MGRSetPrintLevel( void *mgr_vdata, HYPRE_Int print_level );
+HYPRE_Int hypre_MGRSetFrelaxPrintLevel( void *mgr_vdata, HYPRE_Int print_level );
+HYPRE_Int hypre_MGRSetCoarseGridPrintLevel( void *mgr_vdata, HYPRE_Int print_level );
 HYPRE_Int hypre_MGRSetTruncateCoarseGridThreshold( void *mgr_vdata, HYPRE_Real threshold);
 HYPRE_Int hypre_MGRSetLogging( void *mgr_vdata, HYPRE_Int logging );
 HYPRE_Int hypre_MGRSetMaxIter( void *mgr_vdata, HYPRE_Int max_iter );
--- a/src/seq_mv/csr_matop.c
+++ b/src/seq_mv/csr_matop.c
@ -914,6 +914,12 @@ hypre_CSRMatrixTransposeHost(hypre_CSRMatrix  *A,
   hypre_CSRMatrixI(*AT)[num_cols_A] = num_nnzs_A;
   hypre_TFree(bucket, HYPRE_MEMORY_HOST);

+   /* Move diagonal to first entry (for square matrices only)*/
+   if(num_rows_A == num_cols_A)
+   {
+      hypre_CSRMatrixReorder(*AT);
+   }
+
   // Set rownnz and num_rownnz
   if (hypre_CSRMatrixNumRownnz(A) < num_rows_A)
   {
--- a/src/seq_mv/csr_matop_device.c
+++ b/src/seq_mv/csr_matop_device.c
@ -1207,6 +1207,14 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix  *A,
   *AT_ptr = C;

   hypre_SyncCudaComputeStream(hypre_handle());
+   
+   /* Put diagonal at first entry (for square matrices only)*/
+   if(nrows_A == ncols_A)
+   {
+#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+      hypre_CSRMatrixMoveDiagFirstDevice( *AT_ptr );
+#endif      
+   }   

   return hypre_error_flag;
 }
--- a/src/test/TEST_ij/elast.saved
+++ b/src/test/TEST_ij/elast.saved
@ -24,7 +24,7 @@ Final Relative Residual Norm = 7.212489e-09

 # Output file: elast.out.6
 Iterations = 14
-Final Relative Residual Norm = 1.005958e-09
+Final Relative Residual Norm = 1.005959e-09

 # Output file: elast.out.7
 Iterations = 16
--- a/src/test/TEST_ij/solvers.saved
+++ b/src/test/TEST_ij/solvers.saved
@ -339,7 +339,7 @@ Final Relative Residual Norm = 7.556742e-09

 # Output file: solvers.out.307
 hypre_ILU Iterations = 38
-Final Relative Residual Norm = 7.037599e-09
+Final Relative Residual Norm = 7.037600e-09

 # Output file: solvers.out.308
 hypre_ILU Iterations = 26