Improve FSAI performance for GPUs (#1010)
* Add Gauss-Jordan solve implementation for GPUs * Improve FSAI defaults * Update description of HYPRE_FSAISetAlgoType
This commit is contained in:
parent
c215800934
commit
a67b6acc52
@ -1647,6 +1647,7 @@ HYPRE_Int HYPRE_FSAISetAlgoType( HYPRE_Solver solver,
|
||||
* (Optional) Sets the solver type for solving local linear systems in FSAI. This
|
||||
* option makes sense only for GPU runs.
|
||||
*
|
||||
* - 0: Gauss-Jordan solver
|
||||
* - 1: Vendor solver (cuSOLVER/rocSOLVER)
|
||||
* - 2: MAGMA solver
|
||||
**/
|
||||
|
||||
@ -235,11 +235,11 @@ hypre_BoomerAMGCreate( void )
|
||||
{
|
||||
fsai_algo_type = hypre_NumThreads() > 4 ? 2 : 1;
|
||||
}
|
||||
fsai_local_solve_type = 1;
|
||||
fsai_max_steps = 5;
|
||||
fsai_max_step_size = 3;
|
||||
fsai_max_nnz_row = fsai_max_steps * fsai_max_step_size;
|
||||
fsai_num_levels = 2;
|
||||
fsai_local_solve_type = 0;
|
||||
fsai_max_steps = 4;
|
||||
fsai_max_step_size = 2;
|
||||
fsai_max_nnz_row = 8;
|
||||
fsai_num_levels = 0;
|
||||
fsai_threshold = 0.01;
|
||||
fsai_eig_maxiter = 5;
|
||||
fsai_kap_tolerance = 0.001;
|
||||
|
||||
@ -45,7 +45,7 @@ hypre_FSAICreate( void )
|
||||
fsai_data = hypre_CTAlloc(hypre_ParFSAIData, 1, HYPRE_MEMORY_HOST);
|
||||
|
||||
/* setup params */
|
||||
local_solve_type = 1;
|
||||
local_solve_type = 0;
|
||||
max_steps = 3;
|
||||
max_step_size = 5;
|
||||
max_nnz_row = max_steps * max_step_size;
|
||||
|
||||
@ -11,12 +11,121 @@
|
||||
|
||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||
|
||||
#define mat_(ldim, k, i, j) mat_data[ldim * (ldim * k + i) + j]
|
||||
#define rhs_(ldim, i, j) rhs_data[ldim * i + j]
|
||||
#define sol_(ldim, i, j) sol_data[ldim * i + j]
|
||||
#define mat_(l, k, i, j) mat_data[l * (l * k + i) + j]
|
||||
#define rhs_(l, i, j) rhs_data[l * i + j]
|
||||
#define sol_(l, i, j) sol_data[l * i + j]
|
||||
#define ls_(i, j) ls_data[batch_dim * j + i]
|
||||
|
||||
#define HYPRE_THRUST_ZIP3(A, B, C) thrust::make_zip_iterator(thrust::make_tuple(A, B, C))
|
||||
|
||||
/*--------------------------------------------------------------------------
|
||||
* hypreGPUKernel_BatchedGaussJordanSolve
|
||||
*--------------------------------------------------------------------------*/
|
||||
|
||||
__global__ void
|
||||
__launch_bounds__(1024, 1)
|
||||
hypreGPUKernel_BatchedGaussJordanSolve( hypre_DeviceItem &item,
|
||||
HYPRE_Int batch_num_items,
|
||||
HYPRE_Int batch_dim,
|
||||
HYPRE_Complex *mat_data,
|
||||
HYPRE_Complex *rhs_data,
|
||||
HYPRE_Complex *sol_data )
|
||||
{
|
||||
extern __shared__ void* shmem[];
|
||||
|
||||
HYPRE_Complex *ls_data = (HYPRE_Complex*) shmem;
|
||||
HYPRE_Complex *coef = (HYPRE_Complex*) (ls_data + batch_dim * (batch_dim + 1));
|
||||
HYPRE_Int *pos = (HYPRE_Int*) (coef + 2);
|
||||
|
||||
HYPRE_Int tidx = threadIdx.x;
|
||||
HYPRE_Int tidy = threadIdx.y;
|
||||
HYPRE_Int btid = blockIdx.y * gridDim.x + blockIdx.x;
|
||||
|
||||
HYPRE_Int i, k;
|
||||
HYPRE_Int posA;
|
||||
HYPRE_Complex coefA, coefB;
|
||||
HYPRE_Complex *ptrA;
|
||||
|
||||
if (btid < batch_num_items)
|
||||
{
|
||||
/* Shift to LS belonging to the current batch ID (btid) */
|
||||
mat_data += btid * batch_dim * batch_dim;
|
||||
rhs_data += btid * batch_dim;
|
||||
sol_data += btid * batch_dim;
|
||||
|
||||
/* Copy matrix into shared memory */
|
||||
if (tidy < batch_dim)
|
||||
{
|
||||
ls_(tidx, tidy) = mat_data[tidy * batch_dim + tidx];
|
||||
}
|
||||
|
||||
/* Copy RHS into shared memory */
|
||||
if (tidy == batch_dim)
|
||||
{
|
||||
ls_(tidx, tidy) = rhs_data[tidx];
|
||||
}
|
||||
|
||||
/* Perform elimination */
|
||||
for (k = 0; k < batch_dim; k++)
|
||||
{
|
||||
/* Pivot computation */
|
||||
__syncthreads();
|
||||
if ((tidx < 2) && (tidy == 0))
|
||||
{
|
||||
i = k + 1 + tidx;
|
||||
posA = k;
|
||||
ptrA = &ls_(i, k);
|
||||
coefA = fabs(ls_(k, k));
|
||||
|
||||
#pragma unroll 1
|
||||
for (; i < batch_dim; i += 2)
|
||||
{
|
||||
coefB = fabs(*ptrA);
|
||||
if (coefA < coefB)
|
||||
{
|
||||
coefA = coefB;
|
||||
posA = i;
|
||||
}
|
||||
ptrA += 2;
|
||||
}
|
||||
pos[tidx] = posA;
|
||||
coef[tidx] = coefA;
|
||||
}
|
||||
|
||||
/* Swap row coefficients */
|
||||
__syncthreads();
|
||||
if ((tidx == k) && (tidy >= k))
|
||||
{
|
||||
posA = (coef[1] > coef[0]) ? pos[1] : pos[0];
|
||||
|
||||
coefA = ls_(posA, tidy);
|
||||
ls_(posA, tidy) = ls_(tidx, tidy);
|
||||
ls_(tidx, tidy) = coefA;
|
||||
}
|
||||
|
||||
/* Row scaling */
|
||||
__syncthreads();
|
||||
if ((tidx == k) && (tidy > k))
|
||||
{
|
||||
ls_(tidx, tidy) = ls_(tidx, tidy) * (1.0 / ls_(tidx, k));
|
||||
}
|
||||
|
||||
/* Row elimination */
|
||||
__syncthreads();
|
||||
if ((tidx != k) && (tidy > k))
|
||||
{
|
||||
ls_(tidx, tidy) -= ls_(tidx, k) * ls_(k, tidy);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
if (tidy == batch_dim)
|
||||
{
|
||||
sol_data[tidx] = ls_(tidx, batch_dim);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------
|
||||
* hypreGPUKernel_FSAIExtractSubSystems
|
||||
*
|
||||
@ -38,7 +147,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
|
||||
HYPRE_Int *P_i,
|
||||
HYPRE_Int *P_e,
|
||||
HYPRE_Int *P_j,
|
||||
HYPRE_Int ldim,
|
||||
HYPRE_Int batch_dim,
|
||||
HYPRE_Complex *mat_data,
|
||||
HYPRE_Complex *rhs_data,
|
||||
HYPRE_Int *G_r )
|
||||
@ -57,9 +166,9 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
|
||||
i += hypre_gpu_get_grid_num_warps<1, 1>(item))
|
||||
{
|
||||
/* Set identity matrix */
|
||||
for (j = lane; j < ldim; j += HYPRE_WARP_SIZE)
|
||||
for (j = lane; j < batch_dim; j += HYPRE_WARP_SIZE)
|
||||
{
|
||||
mat_(ldim, i, j, j) = 1.0;
|
||||
mat_(batch_dim, i, j, j) = 1.0;
|
||||
}
|
||||
|
||||
if (lane == 0)
|
||||
@ -104,7 +213,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
|
||||
{
|
||||
if (lane == (hypre_ffs(bitmask) - 1))
|
||||
{
|
||||
rhs_(ldim, i, j - pj) = - read_only_load(A_a + k);
|
||||
rhs_(batch_dim, i, j - pj) = - read_only_load(A_a + k);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -149,8 +258,8 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
|
||||
if (lane == (hypre_ffs(bitmask) - 1))
|
||||
{
|
||||
val = read_only_load(A_a + k);
|
||||
mat_(ldim, i, j - pj, jj - pj) = val;
|
||||
mat_(ldim, i, jj - pj, j - pj) = val;
|
||||
mat_(batch_dim, i, j - pj, jj - pj) = val;
|
||||
mat_(batch_dim, i, jj - pj, j - pj) = val;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -176,7 +285,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
|
||||
__global__ void
|
||||
hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
|
||||
HYPRE_Int num_rows,
|
||||
HYPRE_Int ldim,
|
||||
HYPRE_Int batch_dim,
|
||||
HYPRE_Complex *sol_data,
|
||||
HYPRE_Complex *rhs_data,
|
||||
HYPRE_Complex *scaling,
|
||||
@ -191,9 +300,9 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
|
||||
i += hypre_gpu_get_grid_num_threads<1, 1>(item))
|
||||
{
|
||||
val = scaling[i];
|
||||
for (j = 0; j < ldim; j++)
|
||||
for (j = 0; j < batch_dim; j++)
|
||||
{
|
||||
val += sol_(ldim, i, j) * rhs_(ldim, i, j);
|
||||
val += sol_(batch_dim, i, j) * rhs_(batch_dim, i, j);
|
||||
}
|
||||
|
||||
if (val > 0)
|
||||
@ -222,7 +331,7 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
|
||||
__global__ void
|
||||
hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item,
|
||||
HYPRE_Int num_rows,
|
||||
HYPRE_Int ldim,
|
||||
HYPRE_Int batch_dim,
|
||||
HYPRE_Complex *sol_data,
|
||||
HYPRE_Complex *scaling,
|
||||
HYPRE_Int *K_i,
|
||||
@ -258,7 +367,7 @@ hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item,
|
||||
col = K_j[j];
|
||||
|
||||
G_j[cnt + il] = col;
|
||||
G_a[cnt + il] = sol_(ldim, i, il) * val;
|
||||
G_a[cnt + il] = sol_(batch_dim, i, il) * val;
|
||||
il++;
|
||||
}
|
||||
}
|
||||
@ -575,6 +684,39 @@ hypreGPUKernel_FSAITruncateCandidateUnordered( hypre_DeviceItem &item,
|
||||
}
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------------
|
||||
* hypre_BatchedGaussJordanSolveDevice
|
||||
*
|
||||
* Solve dense linear systems with less than 32 unknowns via Gauss-Jordan
|
||||
* elimination.
|
||||
*--------------------------------------------------------------------------*/
|
||||
|
||||
HYPRE_Int
|
||||
hypre_BatchedGaussJordanSolveDevice( HYPRE_Int batch_num_items,
|
||||
HYPRE_Int batch_dim,
|
||||
HYPRE_Complex *mat_data,
|
||||
HYPRE_Complex *rhs_data,
|
||||
HYPRE_Complex *sol_data )
|
||||
{
|
||||
if (batch_dim > 31)
|
||||
{
|
||||
hypre_error_w_msg(HYPRE_ERROR_GENERIC,
|
||||
"Error: cannot solve for local systems larger than 31.");
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
/* Assign one linear system per thread block*/
|
||||
dim3 bDim = hypre_dim3(batch_dim, batch_dim + 1, 1);
|
||||
dim3 gDim = hypre_dim3(batch_num_items, 1, 1);
|
||||
HYPRE_Int shared_mem_size = (sizeof(HYPRE_Complex) * ((batch_dim + 1) * batch_dim + 2) +
|
||||
sizeof(HYPRE_Int) * 2);
|
||||
|
||||
HYPRE_GPU_LAUNCH2(hypreGPUKernel_BatchedGaussJordanSolve, gDim, bDim, shared_mem_size,
|
||||
batch_num_items, batch_dim, mat_data, rhs_data, sol_data);
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------------
|
||||
* hypre_FSAIExtractSubSystemsDevice
|
||||
*
|
||||
@ -590,7 +732,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int num_rows,
|
||||
HYPRE_Int *P_i,
|
||||
HYPRE_Int *P_e,
|
||||
HYPRE_Int *P_j,
|
||||
HYPRE_Int ldim,
|
||||
HYPRE_Int batch_dim,
|
||||
HYPRE_Complex *mat_data,
|
||||
HYPRE_Complex *rhs_data,
|
||||
HYPRE_Int *G_r )
|
||||
@ -605,7 +747,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int num_rows,
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "warp", bDim);
|
||||
|
||||
HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIExtractSubSystems, gDim, bDim, num_rows,
|
||||
A_i, A_j, A_a, P_i, P_e, P_j, ldim, mat_data, rhs_data, G_r );
|
||||
A_i, A_j, A_a, P_i, P_e, P_j, batch_dim, mat_data, rhs_data, G_r );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -616,7 +758,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int num_rows,
|
||||
|
||||
HYPRE_Int
|
||||
hypre_FSAIScalingDevice( HYPRE_Int num_rows,
|
||||
HYPRE_Int ldim,
|
||||
HYPRE_Int batch_dim,
|
||||
HYPRE_Complex *sol_data,
|
||||
HYPRE_Complex *rhs_data,
|
||||
HYPRE_Complex *scaling,
|
||||
@ -632,7 +774,7 @@ hypre_FSAIScalingDevice( HYPRE_Int num_rows,
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
|
||||
|
||||
HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIScaling, gDim, bDim,
|
||||
num_rows, ldim, sol_data, rhs_data, scaling, info );
|
||||
num_rows, batch_dim, sol_data, rhs_data, scaling, info );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -643,7 +785,7 @@ hypre_FSAIScalingDevice( HYPRE_Int num_rows,
|
||||
|
||||
HYPRE_Int
|
||||
hypre_FSAIGatherEntriesDevice( HYPRE_Int num_rows,
|
||||
HYPRE_Int ldim,
|
||||
HYPRE_Int batch_dim,
|
||||
HYPRE_Complex *sol_data,
|
||||
HYPRE_Complex *scaling,
|
||||
HYPRE_Int *K_i,
|
||||
@ -663,7 +805,7 @@ hypre_FSAIGatherEntriesDevice( HYPRE_Int num_rows,
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
|
||||
|
||||
HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIGatherEntries, gDim, bDim,
|
||||
num_rows, ldim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a );
|
||||
num_rows, batch_dim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -737,6 +879,9 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata,
|
||||
HYPRE_Int block_size = max_nnz_row * max_nnz_row;
|
||||
HYPRE_Int num_nonzeros_G;
|
||||
|
||||
HYPRE_Complex **sol_aop = NULL;
|
||||
HYPRE_Complex **mat_aop = NULL;
|
||||
|
||||
hypre_ParCSRMatrix *Atilde;
|
||||
hypre_ParCSRMatrix *B;
|
||||
hypre_ParCSRMatrix *Ktilde;
|
||||
@ -782,6 +927,13 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata,
|
||||
return hypre_error_flag;
|
||||
#endif
|
||||
}
|
||||
else if (local_solve_type == 0)
|
||||
{
|
||||
if (max_nnz_row > 31)
|
||||
{
|
||||
hypre_ParFSAIDataMaxNnzRow(fsai_data) = max_nnz_row = 31;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unknown local linear solve type!\n");
|
||||
@ -926,15 +1078,18 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata,
|
||||
hypre_GpuProfilingPopRange();
|
||||
|
||||
/* Build array of pointers */
|
||||
if (local_solve_type != 0)
|
||||
{
|
||||
hypre_GpuProfilingPushRange("Storage2");
|
||||
HYPRE_Complex **sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
|
||||
HYPRE_Complex **mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
|
||||
sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
|
||||
mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
|
||||
hypre_GpuProfilingPopRange();
|
||||
|
||||
hypre_GpuProfilingPushRange("FormAOP");
|
||||
hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop);
|
||||
hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop);
|
||||
hypre_GpuProfilingPopRange();
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------
|
||||
* Solve local linear systems
|
||||
@ -995,7 +1150,11 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata,
|
||||
|
||||
hypre_GpuProfilingPushRange("Solve");
|
||||
|
||||
if (local_solve_type == 1)
|
||||
if (local_solve_type == 0)
|
||||
{
|
||||
hypre_BatchedGaussJordanSolveDevice(num_rows, max_nnz_row, mat_data, rhs_data, sol_data);
|
||||
}
|
||||
else if (local_solve_type == 1)
|
||||
{
|
||||
#if defined (HYPRE_USING_CUSOLVER)
|
||||
HYPRE_CUSOLVER_CALL(cusolverDnDpotrsBatched(vs_handle,
|
||||
|
||||
@ -2693,7 +2693,8 @@ hypre_CudaCompileFlagCheck()
|
||||
const hypre_int cuda_arch_actual_minor = cuda_arch_actual % 100;
|
||||
const hypre_int cuda_arch_compile_minor = cuda_arch_compile % 100;
|
||||
|
||||
if (cuda_arch_actual_major != cuda_arch_compile_major || cuda_arch_actual_minor < cuda_arch_compile_minor)
|
||||
if (cuda_arch_actual_major != cuda_arch_compile_major ||
|
||||
cuda_arch_actual_minor < cuda_arch_compile_minor)
|
||||
{
|
||||
char msg[256];
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user