diff --git a/src/parcsr_ls/HYPRE_parcsr_ls.h b/src/parcsr_ls/HYPRE_parcsr_ls.h index 1d9420053..6329ebfe7 100644 --- a/src/parcsr_ls/HYPRE_parcsr_ls.h +++ b/src/parcsr_ls/HYPRE_parcsr_ls.h @@ -1647,6 +1647,7 @@ HYPRE_Int HYPRE_FSAISetAlgoType( HYPRE_Solver solver, * (Optional) Sets the solver type for solving local linear systems in FSAI. This * option makes sense only for GPU runs. * + * - 0: Gauss-Jordan solver * - 1: Vendor solver (cuSOLVER/rocSOLVER) * - 2: MAGMA solver **/ diff --git a/src/parcsr_ls/par_amg.c b/src/parcsr_ls/par_amg.c index a8ef32827..d4c22438e 100644 --- a/src/parcsr_ls/par_amg.c +++ b/src/parcsr_ls/par_amg.c @@ -235,11 +235,11 @@ hypre_BoomerAMGCreate( void ) { fsai_algo_type = hypre_NumThreads() > 4 ? 2 : 1; } - fsai_local_solve_type = 1; - fsai_max_steps = 5; - fsai_max_step_size = 3; - fsai_max_nnz_row = fsai_max_steps * fsai_max_step_size; - fsai_num_levels = 2; + fsai_local_solve_type = 0; + fsai_max_steps = 4; + fsai_max_step_size = 2; + fsai_max_nnz_row = 8; + fsai_num_levels = 0; fsai_threshold = 0.01; fsai_eig_maxiter = 5; fsai_kap_tolerance = 0.001; diff --git a/src/parcsr_ls/par_fsai.c b/src/parcsr_ls/par_fsai.c index 440a8dc93..9ba9e1689 100644 --- a/src/parcsr_ls/par_fsai.c +++ b/src/parcsr_ls/par_fsai.c @@ -45,7 +45,7 @@ hypre_FSAICreate( void ) fsai_data = hypre_CTAlloc(hypre_ParFSAIData, 1, HYPRE_MEMORY_HOST); /* setup params */ - local_solve_type = 1; + local_solve_type = 0; max_steps = 3; max_step_size = 5; max_nnz_row = max_steps * max_step_size; diff --git a/src/parcsr_ls/par_fsai_device.c b/src/parcsr_ls/par_fsai_device.c index 7ca881089..1ac0aadd3 100644 --- a/src/parcsr_ls/par_fsai_device.c +++ b/src/parcsr_ls/par_fsai_device.c @@ -11,12 +11,121 @@ #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) -#define mat_(ldim, k, i, j) mat_data[ldim * (ldim * k + i) + j] -#define rhs_(ldim, i, j) rhs_data[ldim * i + j] -#define sol_(ldim, i, j) sol_data[ldim * i + j] +#define mat_(l, k, i, j) mat_data[l * (l * k + i) + j] +#define rhs_(l, i, j) rhs_data[l * i + j] +#define sol_(l, i, j) sol_data[l * i + j] +#define ls_(i, j) ls_data[batch_dim * j + i] #define HYPRE_THRUST_ZIP3(A, B, C) thrust::make_zip_iterator(thrust::make_tuple(A, B, C)) +/*-------------------------------------------------------------------------- + * hypreGPUKernel_BatchedGaussJordanSolve + *--------------------------------------------------------------------------*/ + +__global__ void +__launch_bounds__(1024, 1) +hypreGPUKernel_BatchedGaussJordanSolve( hypre_DeviceItem &item, + HYPRE_Int batch_num_items, + HYPRE_Int batch_dim, + HYPRE_Complex *mat_data, + HYPRE_Complex *rhs_data, + HYPRE_Complex *sol_data ) +{ + extern __shared__ void* shmem[]; + + HYPRE_Complex *ls_data = (HYPRE_Complex*) shmem; + HYPRE_Complex *coef = (HYPRE_Complex*) (ls_data + batch_dim * (batch_dim + 1)); + HYPRE_Int *pos = (HYPRE_Int*) (coef + 2); + + HYPRE_Int tidx = threadIdx.x; + HYPRE_Int tidy = threadIdx.y; + HYPRE_Int btid = blockIdx.y * gridDim.x + blockIdx.x; + + HYPRE_Int i, k; + HYPRE_Int posA; + HYPRE_Complex coefA, coefB; + HYPRE_Complex *ptrA; + + if (btid < batch_num_items) + { + /* Shift to LS belonging to the current batch ID (btid) */ + mat_data += btid * batch_dim * batch_dim; + rhs_data += btid * batch_dim; + sol_data += btid * batch_dim; + + /* Copy matrix into shared memory */ + if (tidy < batch_dim) + { + ls_(tidx, tidy) = mat_data[tidy * batch_dim + tidx]; + } + + /* Copy RHS into shared memory */ + if (tidy == batch_dim) + { + ls_(tidx, tidy) = rhs_data[tidx]; + } + + /* Perform elimination */ + for (k = 0; k < batch_dim; k++) + { + /* Pivot computation */ + __syncthreads(); + if ((tidx < 2) && (tidy == 0)) + { + i = k + 1 + tidx; + posA = k; + ptrA = &ls_(i, k); + coefA = fabs(ls_(k, k)); + +#pragma unroll 1 + for (; i < batch_dim; i += 2) + { + coefB = fabs(*ptrA); + if (coefA < coefB) + { + coefA = coefB; + posA = i; + } + ptrA += 2; + } + pos[tidx] = posA; + coef[tidx] = coefA; + } + + /* Swap row coefficients */ + __syncthreads(); + if ((tidx == k) && (tidy >= k)) + { + posA = (coef[1] > coef[0]) ? pos[1] : pos[0]; + + coefA = ls_(posA, tidy); + ls_(posA, tidy) = ls_(tidx, tidy); + ls_(tidx, tidy) = coefA; + } + + /* Row scaling */ + __syncthreads(); + if ((tidx == k) && (tidy > k)) + { + ls_(tidx, tidy) = ls_(tidx, tidy) * (1.0 / ls_(tidx, k)); + } + + /* Row elimination */ + __syncthreads(); + if ((tidx != k) && (tidy > k)) + { + ls_(tidx, tidy) -= ls_(tidx, k) * ls_(k, tidy); + } + } + + __syncthreads(); + if (tidy == batch_dim) + { + sol_data[tidx] = ls_(tidx, batch_dim); + } + } +} + /*-------------------------------------------------------------------- * hypreGPUKernel_FSAIExtractSubSystems * @@ -38,7 +147,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item, HYPRE_Int *P_i, HYPRE_Int *P_e, HYPRE_Int *P_j, - HYPRE_Int ldim, + HYPRE_Int batch_dim, HYPRE_Complex *mat_data, HYPRE_Complex *rhs_data, HYPRE_Int *G_r ) @@ -57,9 +166,9 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item, i += hypre_gpu_get_grid_num_warps<1, 1>(item)) { /* Set identity matrix */ - for (j = lane; j < ldim; j += HYPRE_WARP_SIZE) + for (j = lane; j < batch_dim; j += HYPRE_WARP_SIZE) { - mat_(ldim, i, j, j) = 1.0; + mat_(batch_dim, i, j, j) = 1.0; } if (lane == 0) @@ -104,7 +213,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item, { if (lane == (hypre_ffs(bitmask) - 1)) { - rhs_(ldim, i, j - pj) = - read_only_load(A_a + k); + rhs_(batch_dim, i, j - pj) = - read_only_load(A_a + k); } break; } @@ -149,8 +258,8 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item, if (lane == (hypre_ffs(bitmask) - 1)) { val = read_only_load(A_a + k); - mat_(ldim, i, j - pj, jj - pj) = val; - mat_(ldim, i, jj - pj, j - pj) = val; + mat_(batch_dim, i, j - pj, jj - pj) = val; + mat_(batch_dim, i, jj - pj, j - pj) = val; } break; } @@ -176,7 +285,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item, __global__ void hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item, HYPRE_Int num_rows, - HYPRE_Int ldim, + HYPRE_Int batch_dim, HYPRE_Complex *sol_data, HYPRE_Complex *rhs_data, HYPRE_Complex *scaling, @@ -191,9 +300,9 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item, i += hypre_gpu_get_grid_num_threads<1, 1>(item)) { val = scaling[i]; - for (j = 0; j < ldim; j++) + for (j = 0; j < batch_dim; j++) { - val += sol_(ldim, i, j) * rhs_(ldim, i, j); + val += sol_(batch_dim, i, j) * rhs_(batch_dim, i, j); } if (val > 0) @@ -222,7 +331,7 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item, __global__ void hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item, HYPRE_Int num_rows, - HYPRE_Int ldim, + HYPRE_Int batch_dim, HYPRE_Complex *sol_data, HYPRE_Complex *scaling, HYPRE_Int *K_i, @@ -258,7 +367,7 @@ hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item, col = K_j[j]; G_j[cnt + il] = col; - G_a[cnt + il] = sol_(ldim, i, il) * val; + G_a[cnt + il] = sol_(batch_dim, i, il) * val; il++; } } @@ -575,6 +684,39 @@ hypreGPUKernel_FSAITruncateCandidateUnordered( hypre_DeviceItem &item, } } +/*-------------------------------------------------------------------------- + * hypre_BatchedGaussJordanSolveDevice + * + * Solve dense linear systems with less than 32 unknowns via Gauss-Jordan + * elimination. + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_BatchedGaussJordanSolveDevice( HYPRE_Int batch_num_items, + HYPRE_Int batch_dim, + HYPRE_Complex *mat_data, + HYPRE_Complex *rhs_data, + HYPRE_Complex *sol_data ) +{ + if (batch_dim > 31) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, + "Error: cannot solve for local systems larger than 31."); + return hypre_error_flag; + } + + /* Assign one linear system per thread block*/ + dim3 bDim = hypre_dim3(batch_dim, batch_dim + 1, 1); + dim3 gDim = hypre_dim3(batch_num_items, 1, 1); + HYPRE_Int shared_mem_size = (sizeof(HYPRE_Complex) * ((batch_dim + 1) * batch_dim + 2) + + sizeof(HYPRE_Int) * 2); + + HYPRE_GPU_LAUNCH2(hypreGPUKernel_BatchedGaussJordanSolve, gDim, bDim, shared_mem_size, + batch_num_items, batch_dim, mat_data, rhs_data, sol_data); + + return hypre_error_flag; +} + /*-------------------------------------------------------------------------- * hypre_FSAIExtractSubSystemsDevice * @@ -590,7 +732,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int num_rows, HYPRE_Int *P_i, HYPRE_Int *P_e, HYPRE_Int *P_j, - HYPRE_Int ldim, + HYPRE_Int batch_dim, HYPRE_Complex *mat_data, HYPRE_Complex *rhs_data, HYPRE_Int *G_r ) @@ -605,7 +747,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int num_rows, dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "warp", bDim); HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIExtractSubSystems, gDim, bDim, num_rows, - A_i, A_j, A_a, P_i, P_e, P_j, ldim, mat_data, rhs_data, G_r ); + A_i, A_j, A_a, P_i, P_e, P_j, batch_dim, mat_data, rhs_data, G_r ); return hypre_error_flag; } @@ -616,7 +758,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int num_rows, HYPRE_Int hypre_FSAIScalingDevice( HYPRE_Int num_rows, - HYPRE_Int ldim, + HYPRE_Int batch_dim, HYPRE_Complex *sol_data, HYPRE_Complex *rhs_data, HYPRE_Complex *scaling, @@ -632,7 +774,7 @@ hypre_FSAIScalingDevice( HYPRE_Int num_rows, dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIScaling, gDim, bDim, - num_rows, ldim, sol_data, rhs_data, scaling, info ); + num_rows, batch_dim, sol_data, rhs_data, scaling, info ); return hypre_error_flag; } @@ -643,7 +785,7 @@ hypre_FSAIScalingDevice( HYPRE_Int num_rows, HYPRE_Int hypre_FSAIGatherEntriesDevice( HYPRE_Int num_rows, - HYPRE_Int ldim, + HYPRE_Int batch_dim, HYPRE_Complex *sol_data, HYPRE_Complex *scaling, HYPRE_Int *K_i, @@ -663,7 +805,7 @@ hypre_FSAIGatherEntriesDevice( HYPRE_Int num_rows, dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIGatherEntries, gDim, bDim, - num_rows, ldim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a ); + num_rows, batch_dim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a ); return hypre_error_flag; } @@ -737,6 +879,9 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata, HYPRE_Int block_size = max_nnz_row * max_nnz_row; HYPRE_Int num_nonzeros_G; + HYPRE_Complex **sol_aop = NULL; + HYPRE_Complex **mat_aop = NULL; + hypre_ParCSRMatrix *Atilde; hypre_ParCSRMatrix *B; hypre_ParCSRMatrix *Ktilde; @@ -782,6 +927,13 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata, return hypre_error_flag; #endif } + else if (local_solve_type == 0) + { + if (max_nnz_row > 31) + { + hypre_ParFSAIDataMaxNnzRow(fsai_data) = max_nnz_row = 31; + } + } else { hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unknown local linear solve type!\n"); @@ -926,15 +1078,18 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata, hypre_GpuProfilingPopRange(); /* Build array of pointers */ - hypre_GpuProfilingPushRange("Storage2"); - HYPRE_Complex **sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE); - HYPRE_Complex **mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE); - hypre_GpuProfilingPopRange(); + if (local_solve_type != 0) + { + hypre_GpuProfilingPushRange("Storage2"); + sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE); + mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE); + hypre_GpuProfilingPopRange(); - hypre_GpuProfilingPushRange("FormAOP"); - hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop); - hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop); - hypre_GpuProfilingPopRange(); + hypre_GpuProfilingPushRange("FormAOP"); + hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop); + hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop); + hypre_GpuProfilingPopRange(); + } /*----------------------------------------------------- * Solve local linear systems @@ -995,7 +1150,11 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata, hypre_GpuProfilingPushRange("Solve"); - if (local_solve_type == 1) + if (local_solve_type == 0) + { + hypre_BatchedGaussJordanSolveDevice(num_rows, max_nnz_row, mat_data, rhs_data, sol_data); + } + else if (local_solve_type == 1) { #if defined (HYPRE_USING_CUSOLVER) HYPRE_CUSOLVER_CALL(cusolverDnDpotrsBatched(vs_handle, diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 00d6e4fb0..4dec11e1d 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -2693,7 +2693,8 @@ hypre_CudaCompileFlagCheck() const hypre_int cuda_arch_actual_minor = cuda_arch_actual % 100; const hypre_int cuda_arch_compile_minor = cuda_arch_compile % 100; - if (cuda_arch_actual_major != cuda_arch_compile_major || cuda_arch_actual_minor < cuda_arch_compile_minor) + if (cuda_arch_actual_major != cuda_arch_compile_major || + cuda_arch_actual_minor < cuda_arch_compile_minor) { char msg[256];