Improve FSAI performance for GPUs (#1010)
* Add Gauss-Jordan solve implementation for GPUs * Improve FSAI defaults * Update description of HYPRE_FSAISetAlgoType
This commit is contained in:
		
							parent
							
								
									c215800934
								
							
						
					
					
						commit
						a67b6acc52
					
				| @ -1647,6 +1647,7 @@ HYPRE_Int HYPRE_FSAISetAlgoType( HYPRE_Solver solver, | ||||
|  * (Optional) Sets the solver type for solving local linear systems in FSAI. This | ||||
|  * option makes sense only for GPU runs. | ||||
|  * | ||||
|  *      - 0: Gauss-Jordan solver | ||||
|  *      - 1: Vendor solver (cuSOLVER/rocSOLVER) | ||||
|  *      - 2: MAGMA solver | ||||
|  **/ | ||||
|  | ||||
| @ -235,11 +235,11 @@ hypre_BoomerAMGCreate( void ) | ||||
|    { | ||||
|       fsai_algo_type = hypre_NumThreads() > 4 ? 2 : 1; | ||||
|    } | ||||
|    fsai_local_solve_type = 1; | ||||
|    fsai_max_steps = 5; | ||||
|    fsai_max_step_size = 3; | ||||
|    fsai_max_nnz_row = fsai_max_steps * fsai_max_step_size; | ||||
|    fsai_num_levels = 2; | ||||
|    fsai_local_solve_type = 0; | ||||
|    fsai_max_steps = 4; | ||||
|    fsai_max_step_size = 2; | ||||
|    fsai_max_nnz_row = 8; | ||||
|    fsai_num_levels = 0; | ||||
|    fsai_threshold = 0.01; | ||||
|    fsai_eig_maxiter = 5; | ||||
|    fsai_kap_tolerance = 0.001; | ||||
|  | ||||
| @ -45,7 +45,7 @@ hypre_FSAICreate( void ) | ||||
|    fsai_data = hypre_CTAlloc(hypre_ParFSAIData, 1, HYPRE_MEMORY_HOST); | ||||
| 
 | ||||
|    /* setup params */ | ||||
|    local_solve_type = 1; | ||||
|    local_solve_type = 0; | ||||
|    max_steps = 3; | ||||
|    max_step_size = 5; | ||||
|    max_nnz_row = max_steps * max_step_size; | ||||
|  | ||||
| @ -11,12 +11,121 @@ | ||||
| 
 | ||||
| #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) | ||||
| 
 | ||||
| #define mat_(ldim, k, i, j) mat_data[ldim * (ldim * k + i) + j] | ||||
| #define rhs_(ldim, i, j)    rhs_data[ldim * i + j] | ||||
| #define sol_(ldim, i, j)    sol_data[ldim * i + j] | ||||
| #define mat_(l, k, i, j) mat_data[l * (l * k + i) + j] | ||||
| #define rhs_(l, i, j)    rhs_data[l * i + j] | ||||
| #define sol_(l, i, j)    sol_data[l * i + j] | ||||
| #define ls_(i, j)        ls_data[batch_dim * j + i] | ||||
| 
 | ||||
| #define HYPRE_THRUST_ZIP3(A, B, C) thrust::make_zip_iterator(thrust::make_tuple(A, B, C)) | ||||
| 
 | ||||
| /*--------------------------------------------------------------------------
 | ||||
|  * hypreGPUKernel_BatchedGaussJordanSolve | ||||
|  *--------------------------------------------------------------------------*/ | ||||
| 
 | ||||
| __global__ void | ||||
| __launch_bounds__(1024, 1) | ||||
| hypreGPUKernel_BatchedGaussJordanSolve( hypre_DeviceItem  &item, | ||||
|                                         HYPRE_Int          batch_num_items, | ||||
|                                         HYPRE_Int          batch_dim, | ||||
|                                         HYPRE_Complex     *mat_data, | ||||
|                                         HYPRE_Complex     *rhs_data, | ||||
|                                         HYPRE_Complex     *sol_data ) | ||||
| { | ||||
|    extern __shared__ void* shmem[]; | ||||
| 
 | ||||
|    HYPRE_Complex    *ls_data = (HYPRE_Complex*) shmem; | ||||
|    HYPRE_Complex    *coef    = (HYPRE_Complex*) (ls_data + batch_dim * (batch_dim + 1)); | ||||
|    HYPRE_Int        *pos     = (HYPRE_Int*) (coef + 2); | ||||
| 
 | ||||
|    HYPRE_Int         tidx    = threadIdx.x; | ||||
|    HYPRE_Int         tidy    = threadIdx.y; | ||||
|    HYPRE_Int         btid    = blockIdx.y * gridDim.x + blockIdx.x; | ||||
| 
 | ||||
|    HYPRE_Int         i, k; | ||||
|    HYPRE_Int         posA; | ||||
|    HYPRE_Complex     coefA, coefB; | ||||
|    HYPRE_Complex    *ptrA; | ||||
| 
 | ||||
|    if (btid < batch_num_items) | ||||
|    { | ||||
|       /* Shift to LS belonging to the current batch ID (btid) */ | ||||
|       mat_data += btid * batch_dim * batch_dim; | ||||
|       rhs_data += btid * batch_dim; | ||||
|       sol_data += btid * batch_dim; | ||||
| 
 | ||||
|       /* Copy matrix into shared memory */ | ||||
|       if (tidy < batch_dim) | ||||
|       { | ||||
|          ls_(tidx, tidy) = mat_data[tidy * batch_dim + tidx]; | ||||
|       } | ||||
| 
 | ||||
|       /* Copy RHS into shared memory */ | ||||
|       if (tidy == batch_dim) | ||||
|       { | ||||
|          ls_(tidx, tidy) = rhs_data[tidx]; | ||||
|       } | ||||
| 
 | ||||
|       /* Perform elimination */ | ||||
|       for (k = 0; k < batch_dim; k++) | ||||
|       { | ||||
|          /* Pivot computation */ | ||||
|          __syncthreads(); | ||||
|          if ((tidx < 2) && (tidy == 0)) | ||||
|          { | ||||
|             i = k + 1 + tidx; | ||||
|             posA  = k; | ||||
|             ptrA  = &ls_(i, k); | ||||
|             coefA = fabs(ls_(k, k)); | ||||
| 
 | ||||
| #pragma unroll 1 | ||||
|             for (; i < batch_dim; i += 2) | ||||
|             { | ||||
|                coefB = fabs(*ptrA); | ||||
|                if (coefA < coefB) | ||||
|                { | ||||
|                   coefA = coefB; | ||||
|                   posA  = i; | ||||
|                } | ||||
|                ptrA += 2; | ||||
|             } | ||||
|             pos[tidx]  = posA; | ||||
|             coef[tidx] = coefA; | ||||
|          } | ||||
| 
 | ||||
|          /* Swap row coefficients */ | ||||
|          __syncthreads(); | ||||
|          if ((tidx == k) && (tidy >= k)) | ||||
|          { | ||||
|             posA = (coef[1] > coef[0]) ? pos[1] : pos[0]; | ||||
| 
 | ||||
|             coefA = ls_(posA, tidy); | ||||
|             ls_(posA, tidy) = ls_(tidx, tidy); | ||||
|             ls_(tidx, tidy) = coefA; | ||||
|          } | ||||
| 
 | ||||
|          /* Row scaling */ | ||||
|          __syncthreads(); | ||||
|          if ((tidx == k) && (tidy > k)) | ||||
|          { | ||||
|             ls_(tidx, tidy) = ls_(tidx, tidy) * (1.0 / ls_(tidx, k)); | ||||
|          } | ||||
| 
 | ||||
|          /* Row elimination */ | ||||
|          __syncthreads(); | ||||
|          if ((tidx != k) && (tidy > k)) | ||||
|          { | ||||
|             ls_(tidx, tidy) -= ls_(tidx, k) * ls_(k, tidy); | ||||
|          } | ||||
|       } | ||||
| 
 | ||||
|       __syncthreads(); | ||||
|       if (tidy == batch_dim) | ||||
|       { | ||||
|          sol_data[tidx] = ls_(tidx, batch_dim); | ||||
|       } | ||||
|    } | ||||
| } | ||||
| 
 | ||||
| /*--------------------------------------------------------------------
 | ||||
|  * hypreGPUKernel_FSAIExtractSubSystems | ||||
|  * | ||||
| @ -38,7 +147,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item, | ||||
|                                       HYPRE_Int        *P_i, | ||||
|                                       HYPRE_Int        *P_e, | ||||
|                                       HYPRE_Int        *P_j, | ||||
|                                       HYPRE_Int         ldim, | ||||
|                                       HYPRE_Int         batch_dim, | ||||
|                                       HYPRE_Complex    *mat_data, | ||||
|                                       HYPRE_Complex    *rhs_data, | ||||
|                                       HYPRE_Int        *G_r ) | ||||
| @ -57,9 +166,9 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item, | ||||
|         i += hypre_gpu_get_grid_num_warps<1, 1>(item)) | ||||
|    { | ||||
|       /* Set identity matrix */ | ||||
|       for (j = lane; j < ldim; j += HYPRE_WARP_SIZE) | ||||
|       for (j = lane; j < batch_dim; j += HYPRE_WARP_SIZE) | ||||
|       { | ||||
|          mat_(ldim, i, j, j) = 1.0; | ||||
|          mat_(batch_dim, i, j, j) = 1.0; | ||||
|       } | ||||
| 
 | ||||
|       if (lane == 0) | ||||
| @ -104,7 +213,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item, | ||||
|             { | ||||
|                if (lane == (hypre_ffs(bitmask) - 1)) | ||||
|                { | ||||
|                   rhs_(ldim, i, j - pj) = - read_only_load(A_a + k); | ||||
|                   rhs_(batch_dim, i, j - pj) = - read_only_load(A_a + k); | ||||
|                } | ||||
|                break; | ||||
|             } | ||||
| @ -149,8 +258,8 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item, | ||||
|                   if (lane == (hypre_ffs(bitmask) - 1)) | ||||
|                   { | ||||
|                      val = read_only_load(A_a + k); | ||||
|                      mat_(ldim, i, j - pj, jj - pj) = val; | ||||
|                      mat_(ldim, i, jj - pj, j - pj) = val; | ||||
|                      mat_(batch_dim, i, j - pj, jj - pj) = val; | ||||
|                      mat_(batch_dim, i, jj - pj, j - pj) = val; | ||||
|                   } | ||||
|                   break; | ||||
|                } | ||||
| @ -176,7 +285,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item, | ||||
| __global__ void | ||||
| hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item, | ||||
|                             HYPRE_Int         num_rows, | ||||
|                             HYPRE_Int         ldim, | ||||
|                             HYPRE_Int         batch_dim, | ||||
|                             HYPRE_Complex    *sol_data, | ||||
|                             HYPRE_Complex    *rhs_data, | ||||
|                             HYPRE_Complex    *scaling, | ||||
| @ -191,9 +300,9 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item, | ||||
|         i += hypre_gpu_get_grid_num_threads<1, 1>(item)) | ||||
|    { | ||||
|       val = scaling[i]; | ||||
|       for (j = 0; j < ldim; j++) | ||||
|       for (j = 0; j < batch_dim; j++) | ||||
|       { | ||||
|          val += sol_(ldim, i, j) * rhs_(ldim, i, j); | ||||
|          val += sol_(batch_dim, i, j) * rhs_(batch_dim, i, j); | ||||
|       } | ||||
| 
 | ||||
|       if (val > 0) | ||||
| @ -222,7 +331,7 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item, | ||||
| __global__ void | ||||
| hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item, | ||||
|                                   HYPRE_Int         num_rows, | ||||
|                                   HYPRE_Int         ldim, | ||||
|                                   HYPRE_Int         batch_dim, | ||||
|                                   HYPRE_Complex    *sol_data, | ||||
|                                   HYPRE_Complex    *scaling, | ||||
|                                   HYPRE_Int        *K_i, | ||||
| @ -258,7 +367,7 @@ hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item, | ||||
|          col = K_j[j]; | ||||
| 
 | ||||
|          G_j[cnt + il] = col; | ||||
|          G_a[cnt + il] = sol_(ldim, i, il) * val; | ||||
|          G_a[cnt + il] = sol_(batch_dim, i, il) * val; | ||||
|          il++; | ||||
|       } | ||||
|    } | ||||
| @ -575,6 +684,39 @@ hypreGPUKernel_FSAITruncateCandidateUnordered( hypre_DeviceItem &item, | ||||
|    } | ||||
| } | ||||
| 
 | ||||
| /*--------------------------------------------------------------------------
 | ||||
|  * hypre_BatchedGaussJordanSolveDevice | ||||
|  * | ||||
|  * Solve dense linear systems with less than 32 unknowns via Gauss-Jordan | ||||
|  * elimination. | ||||
|  *--------------------------------------------------------------------------*/ | ||||
| 
 | ||||
| HYPRE_Int | ||||
| hypre_BatchedGaussJordanSolveDevice( HYPRE_Int       batch_num_items, | ||||
|                                      HYPRE_Int       batch_dim, | ||||
|                                      HYPRE_Complex  *mat_data, | ||||
|                                      HYPRE_Complex  *rhs_data, | ||||
|                                      HYPRE_Complex  *sol_data ) | ||||
| { | ||||
|    if (batch_dim > 31) | ||||
|    { | ||||
|       hypre_error_w_msg(HYPRE_ERROR_GENERIC, | ||||
|                         "Error: cannot solve for local systems larger than 31."); | ||||
|       return hypre_error_flag; | ||||
|    } | ||||
| 
 | ||||
|    /* Assign one linear system per thread block*/ | ||||
|    dim3       bDim = hypre_dim3(batch_dim, batch_dim + 1, 1); | ||||
|    dim3       gDim = hypre_dim3(batch_num_items, 1, 1); | ||||
|    HYPRE_Int  shared_mem_size = (sizeof(HYPRE_Complex) * ((batch_dim + 1) * batch_dim + 2) + | ||||
|                                  sizeof(HYPRE_Int) * 2); | ||||
| 
 | ||||
|    HYPRE_GPU_LAUNCH2(hypreGPUKernel_BatchedGaussJordanSolve, gDim, bDim, shared_mem_size, | ||||
|                      batch_num_items, batch_dim, mat_data, rhs_data, sol_data); | ||||
| 
 | ||||
|    return hypre_error_flag; | ||||
| } | ||||
| 
 | ||||
| /*--------------------------------------------------------------------------
 | ||||
|  * hypre_FSAIExtractSubSystemsDevice | ||||
|  * | ||||
| @ -590,7 +732,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int       num_rows, | ||||
|                                    HYPRE_Int      *P_i, | ||||
|                                    HYPRE_Int      *P_e, | ||||
|                                    HYPRE_Int      *P_j, | ||||
|                                    HYPRE_Int       ldim, | ||||
|                                    HYPRE_Int       batch_dim, | ||||
|                                    HYPRE_Complex  *mat_data, | ||||
|                                    HYPRE_Complex  *rhs_data, | ||||
|                                    HYPRE_Int      *G_r ) | ||||
| @ -605,7 +747,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int       num_rows, | ||||
|    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "warp", bDim); | ||||
| 
 | ||||
|    HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIExtractSubSystems, gDim, bDim, num_rows, | ||||
|                      A_i, A_j, A_a, P_i, P_e, P_j, ldim, mat_data, rhs_data, G_r ); | ||||
|                      A_i, A_j, A_a, P_i, P_e, P_j, batch_dim, mat_data, rhs_data, G_r ); | ||||
| 
 | ||||
|    return hypre_error_flag; | ||||
| } | ||||
| @ -616,7 +758,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int       num_rows, | ||||
| 
 | ||||
| HYPRE_Int | ||||
| hypre_FSAIScalingDevice( HYPRE_Int       num_rows, | ||||
|                          HYPRE_Int       ldim, | ||||
|                          HYPRE_Int       batch_dim, | ||||
|                          HYPRE_Complex  *sol_data, | ||||
|                          HYPRE_Complex  *rhs_data, | ||||
|                          HYPRE_Complex  *scaling, | ||||
| @ -632,7 +774,7 @@ hypre_FSAIScalingDevice( HYPRE_Int       num_rows, | ||||
|    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); | ||||
| 
 | ||||
|    HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIScaling, gDim, bDim, | ||||
|                      num_rows, ldim, sol_data, rhs_data, scaling, info ); | ||||
|                      num_rows, batch_dim, sol_data, rhs_data, scaling, info ); | ||||
| 
 | ||||
|    return hypre_error_flag; | ||||
| } | ||||
| @ -643,7 +785,7 @@ hypre_FSAIScalingDevice( HYPRE_Int       num_rows, | ||||
| 
 | ||||
| HYPRE_Int | ||||
| hypre_FSAIGatherEntriesDevice( HYPRE_Int       num_rows, | ||||
|                                HYPRE_Int       ldim, | ||||
|                                HYPRE_Int       batch_dim, | ||||
|                                HYPRE_Complex  *sol_data, | ||||
|                                HYPRE_Complex  *scaling, | ||||
|                                HYPRE_Int      *K_i, | ||||
| @ -663,7 +805,7 @@ hypre_FSAIGatherEntriesDevice( HYPRE_Int       num_rows, | ||||
|    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); | ||||
| 
 | ||||
|    HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIGatherEntries, gDim, bDim, | ||||
|                      num_rows, ldim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a ); | ||||
|                      num_rows, batch_dim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a ); | ||||
| 
 | ||||
|    return hypre_error_flag; | ||||
| } | ||||
| @ -737,6 +879,9 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata, | ||||
|    HYPRE_Int               block_size       = max_nnz_row * max_nnz_row; | ||||
|    HYPRE_Int               num_nonzeros_G; | ||||
| 
 | ||||
|    HYPRE_Complex         **sol_aop = NULL; | ||||
|    HYPRE_Complex         **mat_aop = NULL; | ||||
| 
 | ||||
|    hypre_ParCSRMatrix     *Atilde; | ||||
|    hypre_ParCSRMatrix     *B; | ||||
|    hypre_ParCSRMatrix     *Ktilde; | ||||
| @ -782,6 +927,13 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata, | ||||
|       return hypre_error_flag; | ||||
| #endif | ||||
|    } | ||||
|    else if (local_solve_type == 0) | ||||
|    { | ||||
|       if (max_nnz_row > 31) | ||||
|       { | ||||
|          hypre_ParFSAIDataMaxNnzRow(fsai_data) = max_nnz_row = 31; | ||||
|       } | ||||
|    } | ||||
|    else | ||||
|    { | ||||
|       hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unknown local linear solve type!\n"); | ||||
| @ -926,15 +1078,18 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata, | ||||
|    hypre_GpuProfilingPopRange(); | ||||
| 
 | ||||
|    /* Build array of pointers */ | ||||
|    hypre_GpuProfilingPushRange("Storage2"); | ||||
|    HYPRE_Complex **sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE); | ||||
|    HYPRE_Complex **mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE); | ||||
|    hypre_GpuProfilingPopRange(); | ||||
|    if (local_solve_type != 0) | ||||
|    { | ||||
|       hypre_GpuProfilingPushRange("Storage2"); | ||||
|       sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE); | ||||
|       mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE); | ||||
|       hypre_GpuProfilingPopRange(); | ||||
| 
 | ||||
|    hypre_GpuProfilingPushRange("FormAOP"); | ||||
|    hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop); | ||||
|    hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop); | ||||
|    hypre_GpuProfilingPopRange(); | ||||
|       hypre_GpuProfilingPushRange("FormAOP"); | ||||
|       hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop); | ||||
|       hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop); | ||||
|       hypre_GpuProfilingPopRange(); | ||||
|    } | ||||
| 
 | ||||
|    /*-----------------------------------------------------
 | ||||
|     *  Solve local linear systems | ||||
| @ -995,7 +1150,11 @@ hypre_FSAISetupStaticPowerDevice( void               *fsai_vdata, | ||||
| 
 | ||||
|       hypre_GpuProfilingPushRange("Solve"); | ||||
| 
 | ||||
|       if (local_solve_type == 1) | ||||
|       if (local_solve_type == 0) | ||||
|       { | ||||
|          hypre_BatchedGaussJordanSolveDevice(num_rows, max_nnz_row, mat_data, rhs_data, sol_data); | ||||
|       } | ||||
|       else if (local_solve_type == 1) | ||||
|       { | ||||
| #if defined (HYPRE_USING_CUSOLVER) | ||||
|          HYPRE_CUSOLVER_CALL(cusolverDnDpotrsBatched(vs_handle, | ||||
|  | ||||
| @ -2693,7 +2693,8 @@ hypre_CudaCompileFlagCheck() | ||||
|    const hypre_int cuda_arch_actual_minor  = cuda_arch_actual  % 100; | ||||
|    const hypre_int cuda_arch_compile_minor = cuda_arch_compile % 100; | ||||
| 
 | ||||
|    if (cuda_arch_actual_major != cuda_arch_compile_major || cuda_arch_actual_minor < cuda_arch_compile_minor) | ||||
|    if (cuda_arch_actual_major != cuda_arch_compile_major || | ||||
|        cuda_arch_actual_minor < cuda_arch_compile_minor) | ||||
|    { | ||||
|       char msg[256]; | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Victor A. P. Magri
						Victor A. P. Magri