diff --git a/src/parcsr_ls/HYPRE_parcsr_ls.h b/src/parcsr_ls/HYPRE_parcsr_ls.h
index 1d9420053..6329ebfe7 100644
--- a/src/parcsr_ls/HYPRE_parcsr_ls.h
+++ b/src/parcsr_ls/HYPRE_parcsr_ls.h
@@ -1647,6 +1647,7 @@ HYPRE_Int HYPRE_FSAISetAlgoType( HYPRE_Solver solver,
  * (Optional) Sets the solver type for solving local linear systems in FSAI. This
  * option makes sense only for GPU runs.
  *
+ *      - 0: Gauss-Jordan solver
  *      - 1: Vendor solver (cuSOLVER/rocSOLVER)
  *      - 2: MAGMA solver
  **/
diff --git a/src/parcsr_ls/par_amg.c b/src/parcsr_ls/par_amg.c
index a8ef32827..d4c22438e 100644
--- a/src/parcsr_ls/par_amg.c
+++ b/src/parcsr_ls/par_amg.c
@@ -235,11 +235,11 @@ hypre_BoomerAMGCreate( void )
    {
       fsai_algo_type = hypre_NumThreads() > 4 ? 2 : 1;
    }
-   fsai_local_solve_type = 1;
-   fsai_max_steps = 5;
-   fsai_max_step_size = 3;
-   fsai_max_nnz_row = fsai_max_steps * fsai_max_step_size;
-   fsai_num_levels = 2;
+   fsai_local_solve_type = 0;
+   fsai_max_steps = 4;
+   fsai_max_step_size = 2;
+   fsai_max_nnz_row = 8;
+   fsai_num_levels = 0;
    fsai_threshold = 0.01;
    fsai_eig_maxiter = 5;
    fsai_kap_tolerance = 0.001;
diff --git a/src/parcsr_ls/par_fsai.c b/src/parcsr_ls/par_fsai.c
index 440a8dc93..9ba9e1689 100644
--- a/src/parcsr_ls/par_fsai.c
+++ b/src/parcsr_ls/par_fsai.c
@@ -45,7 +45,7 @@ hypre_FSAICreate( void )
    fsai_data = hypre_CTAlloc(hypre_ParFSAIData, 1, HYPRE_MEMORY_HOST);
 
    /* setup params */
-   local_solve_type = 1;
+   local_solve_type = 0;
    max_steps = 3;
    max_step_size = 5;
    max_nnz_row = max_steps * max_step_size;
diff --git a/src/parcsr_ls/par_fsai_device.c b/src/parcsr_ls/par_fsai_device.c
index 7ca881089..1ac0aadd3 100644
--- a/src/parcsr_ls/par_fsai_device.c
+++ b/src/parcsr_ls/par_fsai_device.c
@@ -11,12 +11,121 @@
 
 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
 
-#define mat_(ldim, k, i, j) mat_data[ldim * (ldim * k + i) + j]
-#define rhs_(ldim, i, j)    rhs_data[ldim * i + j]
-#define sol_(ldim, i, j)    sol_data[ldim * i + j]
+#define mat_(l, k, i, j) mat_data[l * (l * k + i) + j]
+#define rhs_(l, i, j)    rhs_data[l * i + j]
+#define sol_(l, i, j)    sol_data[l * i + j]
+#define ls_(i, j)        ls_data[batch_dim * j + i]
 
 #define HYPRE_THRUST_ZIP3(A, B, C) thrust::make_zip_iterator(thrust::make_tuple(A, B, C))
 
+/*--------------------------------------------------------------------------
+ * hypreGPUKernel_BatchedGaussJordanSolve
+ *--------------------------------------------------------------------------*/
+
+__global__ void
+__launch_bounds__(1024, 1)
+hypreGPUKernel_BatchedGaussJordanSolve( hypre_DeviceItem  &item,
+                                        HYPRE_Int          batch_num_items,
+                                        HYPRE_Int          batch_dim,
+                                        HYPRE_Complex     *mat_data,
+                                        HYPRE_Complex     *rhs_data,
+                                        HYPRE_Complex     *sol_data )
+{
+   extern __shared__ void* shmem[];
+
+   HYPRE_Complex    *ls_data = (HYPRE_Complex*) shmem;
+   HYPRE_Complex    *coef    = (HYPRE_Complex*) (ls_data + batch_dim * (batch_dim + 1));
+   HYPRE_Int        *pos     = (HYPRE_Int*) (coef + 2);
+
+   HYPRE_Int         tidx    = threadIdx.x;
+   HYPRE_Int         tidy    = threadIdx.y;
+   HYPRE_Int         btid    = blockIdx.y * gridDim.x + blockIdx.x;
+
+   HYPRE_Int         i, k;
+   HYPRE_Int         posA;
+   HYPRE_Complex     coefA, coefB;
+   HYPRE_Complex    *ptrA;
+
+   if (btid < batch_num_items)
+   {
+      /* Shift to LS belonging to the current batch ID (btid) */
+      mat_data += btid * batch_dim * batch_dim;
+      rhs_data += btid * batch_dim;
+      sol_data += btid * batch_dim;
+
+      /* Copy matrix into shared memory */
+      if (tidy < batch_dim)
+      {
+         ls_(tidx, tidy) = mat_data[tidy * batch_dim + tidx];
+      }
+
+      /* Copy RHS into shared memory */
+      if (tidy == batch_dim)
+      {
+         ls_(tidx, tidy) = rhs_data[tidx];
+      }
+
+      /* Perform elimination */
+      for (k = 0; k < batch_dim; k++)
+      {
+         /* Pivot computation */
+         __syncthreads();
+         if ((tidx < 2) && (tidy == 0))
+         {
+            i = k + 1 + tidx;
+            posA  = k;
+            ptrA  = &ls_(i, k);
+            coefA = fabs(ls_(k, k));
+
+#pragma unroll 1
+            for (; i < batch_dim; i += 2)
+            {
+               coefB = fabs(*ptrA);
+               if (coefA < coefB)
+               {
+                  coefA = coefB;
+                  posA  = i;
+               }
+               ptrA += 2;
+            }
+            pos[tidx]  = posA;
+            coef[tidx] = coefA;
+         }
+
+         /* Swap row coefficients */
+         __syncthreads();
+         if ((tidx == k) && (tidy >= k))
+         {
+            posA = (coef[1] > coef[0]) ? pos[1] : pos[0];
+
+            coefA = ls_(posA, tidy);
+            ls_(posA, tidy) = ls_(tidx, tidy);
+            ls_(tidx, tidy) = coefA;
+         }
+
+         /* Row scaling */
+         __syncthreads();
+         if ((tidx == k) && (tidy > k))
+         {
+            ls_(tidx, tidy) = ls_(tidx, tidy) * (1.0 / ls_(tidx, k));
+         }
+
+         /* Row elimination */
+         __syncthreads();
+         if ((tidx != k) && (tidy > k))
+         {
+            ls_(tidx, tidy) -= ls_(tidx, k) * ls_(k, tidy);
+         }
+      }
+
+      __syncthreads();
+      if (tidy == batch_dim)
+      {
+         sol_data[tidx] = ls_(tidx, batch_dim);
+      }
+   }
+}
+
 /*--------------------------------------------------------------------
  * hypreGPUKernel_FSAIExtractSubSystems
  *
@@ -38,7 +147,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
                                       HYPRE_Int        *P_i,
                                       HYPRE_Int        *P_e,
                                       HYPRE_Int        *P_j,
-                                      HYPRE_Int         ldim,
+                                      HYPRE_Int         batch_dim,
                                       HYPRE_Complex    *mat_data,
                                       HYPRE_Complex    *rhs_data,
                                       HYPRE_Int        *G_r )
@@ -57,9 +166,9 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
         i += hypre_gpu_get_grid_num_warps<1, 1>(item))
    {
       /* Set identity matrix */
-      for (j = lane; j < ldim; j += HYPRE_WARP_SIZE)
+      for (j = lane; j < batch_dim; j += HYPRE_WARP_SIZE)
       {
-         mat_(ldim, i, j, j) = 1.0;
+         mat_(batch_dim, i, j, j) = 1.0;
       }
 
       if (lane == 0)
@@ -104,7 +213,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
             {
                if (lane == (hypre_ffs(bitmask) - 1))
                {
-                  rhs_(ldim, i, j - pj) = - read_only_load(A_a + k);
+                  rhs_(batch_dim, i, j - pj) = - read_only_load(A_a + k);
                }
                break;
             }
@@ -149,8 +258,8 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
                   if (lane == (hypre_ffs(bitmask) - 1))
                   {
                      val = read_only_load(A_a + k);
-                     mat_(ldim, i, j - pj, jj - pj) = val;
-                     mat_(ldim, i, jj - pj, j - pj) = val;
+                     mat_(batch_dim, i, j - pj, jj - pj) = val;
+                     mat_(batch_dim, i, jj - pj, j - pj) = val;
                   }
                   break;
                }
@@ -176,7 +285,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
 __global__ void
 hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
                             HYPRE_Int         num_rows,
-                            HYPRE_Int         ldim,
+                            HYPRE_Int         batch_dim,
                             HYPRE_Complex    *sol_data,
                             HYPRE_Complex    *rhs_data,
                             HYPRE_Complex    *scaling,
@@ -191,9 +300,9 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
         i += hypre_gpu_get_grid_num_threads<1, 1>(item))
    {
       val = scaling[i];
-      for (j = 0; j < ldim; j++)
+      for (j = 0; j < batch_dim; j++)
       {
-         val += sol_(ldim, i, j) * rhs_(ldim, i, j);
+         val += sol_(batch_dim, i, j) * rhs_(batch_dim, i, j);
       }
 
       if (val > 0)
@@ -222,7 +331,7 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
 __global__ void
 hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item,
                                   HYPRE_Int         num_rows,
-                                  HYPRE_Int         ldim,
+                                  HYPRE_Int         batch_dim,
                                   HYPRE_Complex    *sol_data,
                                   HYPRE_Complex    *scaling,
                                   HYPRE_Int        *K_i,
@@ -258,7 +367,7 @@ hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item,
          col = K_j[j];
 
          G_j[cnt + il] = col;
-         G_a[cnt + il] = sol_(ldim, i, il) * val;
+         G_a[cnt + il] = sol_(batch_dim, i, il) * val;
          il++;
       }
    }
@@ -575,6 +684,39 @@ hypreGPUKernel_FSAITruncateCandidateUnordered( hypre_DeviceItem &item,
    }
 }
 
+/*--------------------------------------------------------------------------
+ * hypre_BatchedGaussJordanSolveDevice
+ *
+ * Solve dense linear systems with less than 32 unknowns via Gauss-Jordan
+ * elimination.
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+hypre_BatchedGaussJordanSolveDevice( HYPRE_Int       batch_num_items,
+                                     HYPRE_Int       batch_dim,
+                                     HYPRE_Complex  *mat_data,
+                                     HYPRE_Complex  *rhs_data,
+                                     HYPRE_Complex  *sol_data )
+{
+   if (batch_dim > 31)
+   {
+      hypre_error_w_msg(HYPRE_ERROR_GENERIC,
+                        "Error: cannot solve for local systems larger than 31.");
+      return hypre_error_flag;
+   }
+
+   /* Assign one linear system per thread block*/
+   dim3       bDim = hypre_dim3(batch_dim, batch_dim + 1, 1);
+   dim3       gDim = hypre_dim3(batch_num_items, 1, 1);
+   HYPRE_Int  shared_mem_size = (sizeof(HYPRE_Complex) * ((batch_dim + 1) * batch_dim + 2) +
+                                 sizeof(HYPRE_Int) * 2);
+
+   HYPRE_GPU_LAUNCH2(hypreGPUKernel_BatchedGaussJordanSolve, gDim, bDim, shared_mem_size,
+                     batch_num_items, batch_dim, mat_data, rhs_data, sol_data);
+
+   return hypre_error_flag;
+}
+
 /*--------------------------------------------------------------------------
  * hypre_FSAIExtractSubSystemsDevice
  *
@@ -590,7 +732,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int       num_rows,
                                    HYPRE_Int      *P_i,
                                    HYPRE_Int      *P_e,
                                    HYPRE_Int      *P_j,
-                                   HYPRE_Int       ldim,
+                                   HYPRE_Int       batch_dim,
                                    HYPRE_Complex  *mat_data,
                                    HYPRE_Complex  *rhs_data,
                                    HYPRE_Int      *G_r )
@@ -605,7 +747,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int       num_rows,
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "warp", bDim);
 
    HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIExtractSubSystems, gDim, bDim, num_rows,
-                     A_i, A_j, A_a, P_i, P_e, P_j, ldim, mat_data, rhs_data, G_r );
+                     A_i, A_j, A_a, P_i, P_e, P_j, batch_dim, mat_data, rhs_data, G_r );
 
    return hypre_error_flag;
 }
@@ -616,7 +758,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int       num_rows,
 
 HYPRE_Int
 hypre_FSAIScalingDevice( HYPRE_Int       num_rows,
-                         HYPRE_Int       ldim,
+                         HYPRE_Int       batch_dim,
                          HYPRE_Complex  *sol_data,
                          HYPRE_Complex  *rhs_data,
                          HYPRE_Complex  *scaling,
@@ -632,7 +774,7 @@ hypre_FSAIScalingDevice( HYPRE_Int       num_rows,
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
 
    HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIScaling, gDim, bDim,
-                     num_rows, ldim, sol_data, rhs_data, scaling, info );
+                     num_rows, batch_dim, sol_data, rhs_data, scaling, info );
 
    return hypre_error_flag;
 }
@@ -643,7 +785,7 @@ hypre_FSAIScalingDevice( HYPRE_Int       num_rows,
 
 HYPRE_Int
 hypre_FSAIGatherEntriesDevice( HYPRE_Int       num_rows,
-                               HYPRE_Int       ldim,
+                               HYPRE_Int       batch_dim,
                                HYPRE_Complex  *sol_data,
                                HYPRE_Complex  *scaling,
                                HYPRE_Int      *K_i,
@@ -663,7 +805,7 @@ hypre_FSAIGatherEntriesDevice( HYPRE_Int       num_rows,
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
 
    HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIGatherEntries, gDim, bDim,
-                     num_rows, ldim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a );
+                     num_rows, batch_dim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a );
 
    return hypre_error_flag;
 }
@@ -737,6 +879,9 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata,
    HYPRE_Int               block_size       = max_nnz_row * max_nnz_row;
    HYPRE_Int               num_nonzeros_G;
 
+   HYPRE_Complex         **sol_aop = NULL;
+   HYPRE_Complex         **mat_aop = NULL;
+
    hypre_ParCSRMatrix     *Atilde;
    hypre_ParCSRMatrix     *B;
    hypre_ParCSRMatrix     *Ktilde;
@@ -782,6 +927,13 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata,
       return hypre_error_flag;
 #endif
    }
+   else if (local_solve_type == 0)
+   {
+      if (max_nnz_row > 31)
+      {
+         hypre_ParFSAIDataMaxNnzRow(fsai_data) = max_nnz_row = 31;
+      }
+   }
    else
    {
       hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unknown local linear solve type!\n");
@@ -926,15 +1078,18 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata,
    hypre_GpuProfilingPopRange();
 
    /* Build array of pointers */
-   hypre_GpuProfilingPushRange("Storage2");
-   HYPRE_Complex **sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
-   HYPRE_Complex **mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
-   hypre_GpuProfilingPopRange();
+   if (local_solve_type != 0)
+   {
+      hypre_GpuProfilingPushRange("Storage2");
+      sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
+      mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
+      hypre_GpuProfilingPopRange();
 
-   hypre_GpuProfilingPushRange("FormAOP");
-   hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop);
-   hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop);
-   hypre_GpuProfilingPopRange();
+      hypre_GpuProfilingPushRange("FormAOP");
+      hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop);
+      hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop);
+      hypre_GpuProfilingPopRange();
+   }
 
    /*-----------------------------------------------------
     *  Solve local linear systems
@@ -995,7 +1150,11 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata,
 
       hypre_GpuProfilingPushRange("Solve");
 
-      if (local_solve_type == 1)
+      if (local_solve_type == 0)
+      {
+         hypre_BatchedGaussJordanSolveDevice(num_rows, max_nnz_row, mat_data, rhs_data, sol_data);
+      }
+      else if (local_solve_type == 1)
       {
 #if defined (HYPRE_USING_CUSOLVER)
          HYPRE_CUSOLVER_CALL(cusolverDnDpotrsBatched(vs_handle,
diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c
index 00d6e4fb0..4dec11e1d 100644
--- a/src/utilities/device_utils.c
+++ b/src/utilities/device_utils.c
@@ -2693,7 +2693,8 @@ hypre_CudaCompileFlagCheck()
    const hypre_int cuda_arch_actual_minor  = cuda_arch_actual  % 100;
    const hypre_int cuda_arch_compile_minor = cuda_arch_compile % 100;
 
-   if (cuda_arch_actual_major != cuda_arch_compile_major || cuda_arch_actual_minor < cuda_arch_compile_minor)
+   if (cuda_arch_actual_major != cuda_arch_compile_major ||
+       cuda_arch_actual_minor < cuda_arch_compile_minor)
    {
       char msg[256];