Improve FSAI performance for GPUs (#1010)

* Add Gauss-Jordan solve implementation for GPUs * Improve FSAI defaults * Update description of HYPRE_FSAISetAlgoType
2023-11-15 20:45:04 -05:00 · 2023-11-15 20:45:04 -05:00 · a67b6acc52
commit a67b6acc52
parent c215800934
5 changed files with 197 additions and 36 deletions
--- a/src/parcsr_ls/HYPRE_parcsr_ls.h
+++ b/src/parcsr_ls/HYPRE_parcsr_ls.h
@ -1647,6 +1647,7 @@ HYPRE_Int HYPRE_FSAISetAlgoType( HYPRE_Solver solver,
 * (Optional) Sets the solver type for solving local linear systems in FSAI. This
 * option makes sense only for GPU runs.
 *
+ *      - 0: Gauss-Jordan solver
 *      - 1: Vendor solver (cuSOLVER/rocSOLVER)
 *      - 2: MAGMA solver
 **/
--- a/src/parcsr_ls/par_amg.c
+++ b/src/parcsr_ls/par_amg.c
@ -235,11 +235,11 @@ hypre_BoomerAMGCreate( void )
   {
      fsai_algo_type = hypre_NumThreads() > 4 ? 2 : 1;
   }
-   fsai_local_solve_type = 1;
-   fsai_max_steps = 5;
-   fsai_max_step_size = 3;
-   fsai_max_nnz_row = fsai_max_steps * fsai_max_step_size;
-   fsai_num_levels = 2;
+   fsai_local_solve_type = 0;
+   fsai_max_steps = 4;
+   fsai_max_step_size = 2;
+   fsai_max_nnz_row = 8;
+   fsai_num_levels = 0;
   fsai_threshold = 0.01;
   fsai_eig_maxiter = 5;
   fsai_kap_tolerance = 0.001;
--- a/src/parcsr_ls/par_fsai.c
+++ b/src/parcsr_ls/par_fsai.c
@ -45,7 +45,7 @@ hypre_FSAICreate( void )
   fsai_data = hypre_CTAlloc(hypre_ParFSAIData, 1, HYPRE_MEMORY_HOST);

   /* setup params */
-   local_solve_type = 1;
+   local_solve_type = 0;
   max_steps = 3;
   max_step_size = 5;
   max_nnz_row = max_steps * max_step_size;
--- a/src/parcsr_ls/par_fsai_device.c
+++ b/src/parcsr_ls/par_fsai_device.c
@ -11,12 +11,121 @@

 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

-#define mat_(ldim, k, i, j) mat_data[ldim * (ldim * k + i) + j]
-#define rhs_(ldim, i, j)    rhs_data[ldim * i + j]
-#define sol_(ldim, i, j)    sol_data[ldim * i + j]
+#define mat_(l, k, i, j) mat_data[l * (l * k + i) + j]
+#define rhs_(l, i, j)    rhs_data[l * i + j]
+#define sol_(l, i, j)    sol_data[l * i + j]
+#define ls_(i, j)        ls_data[batch_dim * j + i]

 #define HYPRE_THRUST_ZIP3(A, B, C) thrust::make_zip_iterator(thrust::make_tuple(A, B, C))

+/*--------------------------------------------------------------------------
+ * hypreGPUKernel_BatchedGaussJordanSolve
+ *
+ * Solves a batch of small dense linear systems via Gauss-Jordan elimination
+ * with partial (row) pivoting. Each thread block works on one system, which
+ * is selected by the linear block ID.
+ *
+ * Expected thread layout within a block (follows from the indexing below):
+ *   - threadIdx.x in [0, batch_dim): row index
+ *   - threadIdx.y in [0, batch_dim]: column index of the augmented system;
+ *     column batch_dim holds the right-hand side
+ *
+ * Dynamic shared memory requirement:
+ *   batch_dim * (batch_dim + 1) + 2 values of type HYPRE_Complex followed by
+ *   2 values of type HYPRE_Int.
+ *
+ * Parameters:
+ *   item            - device item (not used inside this kernel)
+ *   batch_num_items - number of linear systems in the batch
+ *   batch_dim       - number of unknowns of each linear system
+ *   mat_data        - matrices, batch_dim * batch_dim entries per system
+ *   rhs_data        - right-hand sides, batch_dim entries per system
+ *   sol_data        - (output) solutions, batch_dim entries per system
+ *
+ * Note: zero pivots are not detected.
+ *--------------------------------------------------------------------------*/
+
+__global__ void
+__launch_bounds__(1024, 1)
+hypreGPUKernel_BatchedGaussJordanSolve( hypre_DeviceItem  &item,
+                                        HYPRE_Int          batch_num_items,
+                                        HYPRE_Int          batch_dim,
+                                        HYPRE_Complex     *mat_data,
+                                        HYPRE_Complex     *rhs_data,
+                                        HYPRE_Complex     *sol_data )
+{
+   extern __shared__ void* shmem[];
+
+   /* Shared memory layout: augmented system | 2 pivot values | 2 pivot rows */
+   HYPRE_Complex    *ls_data = (HYPRE_Complex*) shmem;
+   HYPRE_Complex    *coef    = (HYPRE_Complex*) (ls_data + batch_dim * (batch_dim + 1));
+   HYPRE_Int        *pos     = (HYPRE_Int*) (coef + 2);
+
+   HYPRE_Int         tidx    = threadIdx.x;
+   HYPRE_Int         tidy    = threadIdx.y;
+   HYPRE_Int         btid    = blockIdx.y * gridDim.x + blockIdx.x;
+
+   HYPRE_Int         i, k;
+   HYPRE_Int         posA;
+   HYPRE_Complex     coefA, coefB;
+   HYPRE_Complex    *ptrA;
+
+   /* The condition is uniform across the block, so the barriers below are
+      reached either by all threads of the block or by none of them */
+   if (btid < batch_num_items)
+   {
+      /* Shift to LS belonging to the current batch ID (btid). The offsets are
+         computed in size_t so that they cannot overflow HYPRE_Int when the
+         batch is large */
+      mat_data += (size_t) btid * batch_dim * batch_dim;
+      rhs_data += (size_t) btid * batch_dim;
+      sol_data += (size_t) btid * batch_dim;
+
+      /* Copy matrix into shared memory */
+      if (tidy < batch_dim)
+      {
+         ls_(tidx, tidy) = mat_data[tidy * batch_dim + tidx];
+      }
+
+      /* Copy RHS into shared memory (last column of the augmented system) */
+      if (tidy == batch_dim)
+      {
+         ls_(tidx, tidy) = rhs_data[tidx];
+      }
+
+      /* Perform elimination */
+      for (k = 0; k < batch_dim; k++)
+      {
+         /* Pivot computation: two threads scan interleaved halves of the rows
+            below the diagonal of column k, each keeping the largest absolute
+            value found and the row where it occurs */
+         __syncthreads();
+         if ((tidx < 2) && (tidy == 0))
+         {
+            i = k + 1 + tidx;
+            posA  = k;
+            ptrA  = &ls_(i, k);
+            coefA = fabs(ls_(k, k));
+
+#pragma unroll 1
+            for (; i < batch_dim; i += 2)
+            {
+               coefB = fabs(*ptrA);
+               if (coefA < coefB)
+               {
+                  coefA = coefB;
+                  posA  = i;
+               }
+               ptrA += 2;
+            }
+            pos[tidx]  = posA;
+            coef[tidx] = coefA;
+         }
+
+         /* Swap row coefficients */
+         __syncthreads();
+         if ((tidx == k) && (tidy >= k))
+         {
+            /* When batch_dim == 1 there is no thread with tidx == 1, hence
+               coef[1] and pos[1] are never written. Ignore them in that case
+               instead of reading uninitialized shared memory */
+            posA = ((batch_dim > 1) && (coef[1] > coef[0])) ? pos[1] : pos[0];
+
+            coefA = ls_(posA, tidy);
+            ls_(posA, tidy) = ls_(tidx, tidy);
+            ls_(tidx, tidy) = coefA;
+         }
+
+         /* Row scaling: divide the entries of the pivot row located to the
+            right of the diagonal by the pivot */
+         __syncthreads();
+         if ((tidx == k) && (tidy > k))
+         {
+            ls_(tidx, tidy) = ls_(tidx, tidy) * (1.0 / ls_(tidx, k));
+         }
+
+         /* Row elimination: update the columns to the right of column k in
+            every row other than the pivot row */
+         __syncthreads();
+         if ((tidx != k) && (tidy > k))
+         {
+            ls_(tidx, tidy) -= ls_(tidx, k) * ls_(k, tidy);
+         }
+      }
+
+      /* The last column of the augmented system now holds the solution */
+      __syncthreads();
+      if (tidy == batch_dim)
+      {
+         sol_data[tidx] = ls_(tidx, batch_dim);
+      }
+   }
+}
+
 /*--------------------------------------------------------------------
 * hypreGPUKernel_FSAIExtractSubSystems
 *
@ -38,7 +147,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
                                      HYPRE_Int        *P_i,
                                      HYPRE_Int        *P_e,
                                      HYPRE_Int        *P_j,
-                                      HYPRE_Int         ldim,
+                                      HYPRE_Int         batch_dim,
                                      HYPRE_Complex    *mat_data,
                                      HYPRE_Complex    *rhs_data,
                                      HYPRE_Int        *G_r )
@ -57,9 +166,9 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
        i += hypre_gpu_get_grid_num_warps<1, 1>(item))
   {
      /* Set identity matrix */
-      for (j = lane; j < ldim; j += HYPRE_WARP_SIZE)
+      for (j = lane; j < batch_dim; j += HYPRE_WARP_SIZE)
      {
-         mat_(ldim, i, j, j) = 1.0;
+         mat_(batch_dim, i, j, j) = 1.0;
      }

      if (lane == 0)
@ -104,7 +213,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
            {
               if (lane == (hypre_ffs(bitmask) - 1))
               {
-                  rhs_(ldim, i, j - pj) = - read_only_load(A_a + k);
+                  rhs_(batch_dim, i, j - pj) = - read_only_load(A_a + k);
               }
               break;
            }
@ -149,8 +258,8 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
                  if (lane == (hypre_ffs(bitmask) - 1))
                  {
                     val = read_only_load(A_a + k);
-                     mat_(ldim, i, j - pj, jj - pj) = val;
-                     mat_(ldim, i, jj - pj, j - pj) = val;
+                     mat_(batch_dim, i, j - pj, jj - pj) = val;
+                     mat_(batch_dim, i, jj - pj, j - pj) = val;
                  }
                  break;
               }
@ -176,7 +285,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
 __global__ void
 hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
                            HYPRE_Int         num_rows,
-                            HYPRE_Int         ldim,
+                            HYPRE_Int         batch_dim,
                            HYPRE_Complex    *sol_data,
                            HYPRE_Complex    *rhs_data,
                            HYPRE_Complex    *scaling,
@ -191,9 +300,9 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
        i += hypre_gpu_get_grid_num_threads<1, 1>(item))
   {
      val = scaling[i];
-      for (j = 0; j < ldim; j++)
+      for (j = 0; j < batch_dim; j++)
      {
-         val += sol_(ldim, i, j) * rhs_(ldim, i, j);
+         val += sol_(batch_dim, i, j) * rhs_(batch_dim, i, j);
      }

      if (val > 0)
@ -222,7 +331,7 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
 __global__ void
 hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item,
                                  HYPRE_Int         num_rows,
-                                  HYPRE_Int         ldim,
+                                  HYPRE_Int         batch_dim,
                                  HYPRE_Complex    *sol_data,
                                  HYPRE_Complex    *scaling,
                                  HYPRE_Int        *K_i,
@ -258,7 +367,7 @@ hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item,
         col = K_j[j];

         G_j[cnt + il] = col;
-         G_a[cnt + il] = sol_(ldim, i, il) * val;
+         G_a[cnt + il] = sol_(batch_dim, i, il) * val;
         il++;
      }
   }
@ -575,6 +684,39 @@ hypreGPUKernel_FSAITruncateCandidateUnordered( hypre_DeviceItem &item,
   }
 }

+/*--------------------------------------------------------------------------
+ * hypre_BatchedGaussJordanSolveDevice
+ *
+ * Launches the batched Gauss-Jordan elimination kernel on a set of dense
+ * linear systems of equal size. Each system is assigned to one thread block
+ * made of batch_dim x (batch_dim + 1) threads. Systems with more than 31
+ * unknowns are rejected: a generic hypre error is raised and no kernel is
+ * launched.
+ *
+ * Returns the global hypre error flag.
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+hypre_BatchedGaussJordanSolveDevice( HYPRE_Int       batch_num_items,
+                                     HYPRE_Int       batch_dim,
+                                     HYPRE_Complex  *mat_data,
+                                     HYPRE_Complex  *rhs_data,
+                                     HYPRE_Complex  *sol_data )
+{
+   if (batch_dim <= 31)
+   {
+      /* One thread block per linear system */
+      dim3       grid_dims  = hypre_dim3(batch_num_items, 1, 1);
+      dim3       block_dims = hypre_dim3(batch_dim, batch_dim + 1, 1);
+
+      /* Augmented system plus two pivot values (HYPRE_Complex), followed by
+         two pivot positions (HYPRE_Int) */
+      HYPRE_Int  num_complex = (batch_dim + 1) * batch_dim + 2;
+      HYPRE_Int  shmem_bytes = sizeof(HYPRE_Complex) * num_complex +
+                               sizeof(HYPRE_Int) * 2;
+
+      HYPRE_GPU_LAUNCH2(hypreGPUKernel_BatchedGaussJordanSolve,
+                        grid_dims, block_dims, shmem_bytes,
+                        batch_num_items, batch_dim, mat_data, rhs_data, sol_data);
+   }
+   else
+   {
+      hypre_error_w_msg(HYPRE_ERROR_GENERIC,
+                        "Error: cannot solve for local systems larger than 31.");
+   }
+
+   return hypre_error_flag;
+}
+
 /*--------------------------------------------------------------------------
 * hypre_FSAIExtractSubSystemsDevice
 *
@ -590,7 +732,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int       num_rows,
                                   HYPRE_Int      *P_i,
                                   HYPRE_Int      *P_e,
                                   HYPRE_Int      *P_j,
-                                   HYPRE_Int       ldim,
+                                   HYPRE_Int       batch_dim,
                                   HYPRE_Complex  *mat_data,
                                   HYPRE_Complex  *rhs_data,
                                   HYPRE_Int      *G_r )
@ -605,7 +747,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int       num_rows,
   dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "warp", bDim);

   HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIExtractSubSystems, gDim, bDim, num_rows,
-                     A_i, A_j, A_a, P_i, P_e, P_j, ldim, mat_data, rhs_data, G_r );
+                     A_i, A_j, A_a, P_i, P_e, P_j, batch_dim, mat_data, rhs_data, G_r );

   return hypre_error_flag;
 }
@ -616,7 +758,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int       num_rows,

 HYPRE_Int
 hypre_FSAIScalingDevice( HYPRE_Int       num_rows,
-                         HYPRE_Int       ldim,
+                         HYPRE_Int       batch_dim,
                         HYPRE_Complex  *sol_data,
                         HYPRE_Complex  *rhs_data,
                         HYPRE_Complex  *scaling,
@ -632,7 +774,7 @@ hypre_FSAIScalingDevice( HYPRE_Int       num_rows,
   dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);

   HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIScaling, gDim, bDim,
-                     num_rows, ldim, sol_data, rhs_data, scaling, info );
+                     num_rows, batch_dim, sol_data, rhs_data, scaling, info );

   return hypre_error_flag;
 }
@ -643,7 +785,7 @@ hypre_FSAIScalingDevice( HYPRE_Int       num_rows,

 HYPRE_Int
 hypre_FSAIGatherEntriesDevice( HYPRE_Int       num_rows,
-                               HYPRE_Int       ldim,
+                               HYPRE_Int       batch_dim,
                               HYPRE_Complex  *sol_data,
                               HYPRE_Complex  *scaling,
                               HYPRE_Int      *K_i,
@ -663,7 +805,7 @@ hypre_FSAIGatherEntriesDevice( HYPRE_Int       num_rows,
   dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);

   HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIGatherEntries, gDim, bDim,
-                     num_rows, ldim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a );
+                     num_rows, batch_dim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a );

   return hypre_error_flag;
 }
@ -737,6 +879,9 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata,
   HYPRE_Int               block_size       = max_nnz_row * max_nnz_row;
   HYPRE_Int               num_nonzeros_G;

+   HYPRE_Complex         **sol_aop = NULL;
+   HYPRE_Complex         **mat_aop = NULL;
+
   hypre_ParCSRMatrix     *Atilde;
   hypre_ParCSRMatrix     *B;
   hypre_ParCSRMatrix     *Ktilde;
@ -782,6 +927,13 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata,
      return hypre_error_flag;
 #endif
   }
+   else if (local_solve_type == 0)
+   {
+      if (max_nnz_row > 31)
+      {
+         hypre_ParFSAIDataMaxNnzRow(fsai_data) = max_nnz_row = 31;
+      }
+   }
   else
   {
      hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unknown local linear solve type!\n");
@ -926,15 +1078,18 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata,
   hypre_GpuProfilingPopRange();

   /* Build array of pointers */
-   hypre_GpuProfilingPushRange("Storage2");
-   HYPRE_Complex **sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
-   HYPRE_Complex **mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
-   hypre_GpuProfilingPopRange();
+   if (local_solve_type != 0)
+   {
+      hypre_GpuProfilingPushRange("Storage2");
+      sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
+      mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
+      hypre_GpuProfilingPopRange();

-   hypre_GpuProfilingPushRange("FormAOP");
-   hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop);
-   hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop);
-   hypre_GpuProfilingPopRange();
+      hypre_GpuProfilingPushRange("FormAOP");
+      hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop);
+      hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop);
+      hypre_GpuProfilingPopRange();
+   }

   /*-----------------------------------------------------
    *  Solve local linear systems
@ -995,7 +1150,11 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata,

      hypre_GpuProfilingPushRange("Solve");

-      if (local_solve_type == 1)
+      if (local_solve_type == 0)
+      {
+         hypre_BatchedGaussJordanSolveDevice(num_rows, max_nnz_row, mat_data, rhs_data, sol_data);
+      }
+      else if (local_solve_type == 1)
      {
 #if defined (HYPRE_USING_CUSOLVER)
         HYPRE_CUSOLVER_CALL(cusolverDnDpotrsBatched(vs_handle,
--- a/src/utilities/device_utils.c
+++ b/src/utilities/device_utils.c
@ -2693,7 +2693,8 @@ hypre_CudaCompileFlagCheck()
   const hypre_int cuda_arch_actual_minor  = cuda_arch_actual  % 100;
   const hypre_int cuda_arch_compile_minor = cuda_arch_compile % 100;

-   if (cuda_arch_actual_major != cuda_arch_compile_major || cuda_arch_actual_minor < cuda_arch_compile_minor)
+   if (cuda_arch_actual_major != cuda_arch_compile_major ||
+       cuda_arch_actual_minor < cuda_arch_compile_minor)
   {
      char msg[256];