Improve FSAI performance for GPUs (#1010)
* Add Gauss-Jordan solve implementation for GPUs
* Improve FSAI defaults
* Update description of HYPRE_FSAISetAlgoType
This commit is contained in:
parent
c215800934
commit
a67b6acc52
@ -1647,6 +1647,7 @@ HYPRE_Int HYPRE_FSAISetAlgoType( HYPRE_Solver solver,
|
|||||||
* (Optional) Sets the solver type for solving local linear systems in FSAI. This
|
* (Optional) Sets the solver type for solving local linear systems in FSAI. This
|
||||||
* option makes sense only for GPU runs.
|
* option makes sense only for GPU runs.
|
||||||
*
|
*
|
||||||
|
* - 0: Gauss-Jordan solver
|
||||||
* - 1: Vendor solver (cuSOLVER/rocSOLVER)
|
* - 1: Vendor solver (cuSOLVER/rocSOLVER)
|
||||||
* - 2: MAGMA solver
|
* - 2: MAGMA solver
|
||||||
**/
|
**/
|
||||||
|
|||||||
@ -235,11 +235,11 @@ hypre_BoomerAMGCreate( void )
|
|||||||
{
|
{
|
||||||
fsai_algo_type = hypre_NumThreads() > 4 ? 2 : 1;
|
fsai_algo_type = hypre_NumThreads() > 4 ? 2 : 1;
|
||||||
}
|
}
|
||||||
fsai_local_solve_type = 1;
|
fsai_local_solve_type = 0;
|
||||||
fsai_max_steps = 5;
|
fsai_max_steps = 4;
|
||||||
fsai_max_step_size = 3;
|
fsai_max_step_size = 2;
|
||||||
fsai_max_nnz_row = fsai_max_steps * fsai_max_step_size;
|
fsai_max_nnz_row = 8;
|
||||||
fsai_num_levels = 2;
|
fsai_num_levels = 0;
|
||||||
fsai_threshold = 0.01;
|
fsai_threshold = 0.01;
|
||||||
fsai_eig_maxiter = 5;
|
fsai_eig_maxiter = 5;
|
||||||
fsai_kap_tolerance = 0.001;
|
fsai_kap_tolerance = 0.001;
|
||||||
|
|||||||
@ -45,7 +45,7 @@ hypre_FSAICreate( void )
|
|||||||
fsai_data = hypre_CTAlloc(hypre_ParFSAIData, 1, HYPRE_MEMORY_HOST);
|
fsai_data = hypre_CTAlloc(hypre_ParFSAIData, 1, HYPRE_MEMORY_HOST);
|
||||||
|
|
||||||
/* setup params */
|
/* setup params */
|
||||||
local_solve_type = 1;
|
local_solve_type = 0;
|
||||||
max_steps = 3;
|
max_steps = 3;
|
||||||
max_step_size = 5;
|
max_step_size = 5;
|
||||||
max_nnz_row = max_steps * max_step_size;
|
max_nnz_row = max_steps * max_step_size;
|
||||||
|
|||||||
@ -11,12 +11,121 @@
|
|||||||
|
|
||||||
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
|
||||||
|
|
||||||
#define mat_(ldim, k, i, j) mat_data[ldim * (ldim * k + i) + j]
|
#define mat_(l, k, i, j) mat_data[l * (l * k + i) + j]
|
||||||
#define rhs_(ldim, i, j) rhs_data[ldim * i + j]
|
#define rhs_(l, i, j) rhs_data[l * i + j]
|
||||||
#define sol_(ldim, i, j) sol_data[ldim * i + j]
|
#define sol_(l, i, j) sol_data[l * i + j]
|
||||||
|
#define ls_(i, j) ls_data[batch_dim * j + i]
|
||||||
|
|
||||||
#define HYPRE_THRUST_ZIP3(A, B, C) thrust::make_zip_iterator(thrust::make_tuple(A, B, C))
|
#define HYPRE_THRUST_ZIP3(A, B, C) thrust::make_zip_iterator(thrust::make_tuple(A, B, C))
|
||||||
|
|
||||||
|
/*--------------------------------------------------------------------------
 * hypreGPUKernel_BatchedGaussJordanSolve
 *
 * Solves a batch of small dense linear systems by Gauss-Jordan elimination
 * with row pivoting. Each thread block processes one linear system.
 *
 * Expected launch configuration:
 *   - blockDim = (batch_dim, batch_dim + 1, 1). Thread (tidx, tidy) owns the
 *     entry ls_(tidx, tidy) of the augmented system held in shared memory;
 *     the extra slice tidy == batch_dim holds the right-hand side.
 *   - one block per system; the system index is
 *     blockIdx.y * gridDim.x + blockIdx.x.
 *   - dynamic shared memory: batch_dim * (batch_dim + 1) + 2 HYPRE_Complex
 *     values followed by 2 HYPRE_Int values.
 *
 * Parameters:
 *   batch_num_items - number of linear systems in the batch
 *   batch_dim       - number of unknowns of each system
 *   mat_data        - batch_dim * batch_dim coefficients per system, stored
 *                     contiguously system after system (read only)
 *   rhs_data        - batch_dim right-hand side values per system (read only)
 *   sol_data        - batch_dim solution values per system (written)
 *
 * NOTE: there is no check for a zero pivot; a singular system yields
 *       non-finite values in sol_data.
 *--------------------------------------------------------------------------*/

__global__ void
__launch_bounds__(1024, 1)
hypreGPUKernel_BatchedGaussJordanSolve( hypre_DeviceItem  &item,
                                        HYPRE_Int          batch_num_items,
                                        HYPRE_Int          batch_dim,
                                        HYPRE_Complex     *mat_data,
                                        HYPRE_Complex     *rhs_data,
                                        HYPRE_Complex     *sol_data )
{
   extern __shared__ void* shmem[];

   /* Shared memory layout: [augmented system | 2 pivot magnitudes | 2 pivot rows] */
   HYPRE_Complex  *ls_data = (HYPRE_Complex*) shmem;
   HYPRE_Complex  *coef    = (HYPRE_Complex*) (ls_data + batch_dim * (batch_dim + 1));
   HYPRE_Int      *pos     = (HYPRE_Int*) (coef + 2);

   HYPRE_Int       tidx    = threadIdx.x;
   HYPRE_Int       tidy    = threadIdx.y;
   HYPRE_Int       btid    = blockIdx.y * gridDim.x + blockIdx.x;

   HYPRE_Int       i, k;
   HYPRE_Int       posA;
   HYPRE_Complex   coefA, coefB;
   HYPRE_Complex  *ptrA;

   /* The condition is uniform within a block, so the barriers below are safe */
   if (btid < batch_num_items)
   {
      /* Shift to LS belonging to the current batch ID (btid). The offsets are
         computed in size_t to avoid overflowing HYPRE_Int for large batches */
      mat_data += (size_t) btid * batch_dim * batch_dim;
      rhs_data += (size_t) btid * batch_dim;
      sol_data += (size_t) btid * batch_dim;

      /* Copy matrix into shared memory */
      if (tidy < batch_dim)
      {
         ls_(tidx, tidy) = mat_data[tidy * batch_dim + tidx];
      }

      /* Copy RHS into shared memory */
      if (tidy == batch_dim)
      {
         ls_(tidx, tidy) = rhs_data[tidx];
      }

      /* Perform elimination */
      for (k = 0; k < batch_dim; k++)
      {
         /* Pivot computation: two threads scan the entries below the diagonal
            of column k in an interleaved fashion (stride 2); each one stores
            its largest magnitude and the corresponding row */
         __syncthreads();
         if ((tidx < 2) && (tidy == 0))
         {
            i     = k + 1 + tidx;
            posA  = k;
            ptrA  = &ls_(i, k);
            coefA = fabs(ls_(k, k));

#pragma unroll 1
            for (; i < batch_dim; i += 2)
            {
               coefB = fabs(*ptrA);
               if (coefA < coefB)
               {
                  coefA = coefB;
                  posA  = i;
               }
               ptrA += 2;
            }
            pos[tidx]  = posA;
            coef[tidx] = coefA;
         }

         /* Swap row coefficients */
         __syncthreads();
         if ((tidx == k) && (tidy >= k))
         {
            /* When batch_dim == 1 the thread with tidx == 1 does not exist, so
               coef[1]/pos[1] are never written and must not be read */
            posA = ((batch_dim > 1) && (coef[1] > coef[0])) ? pos[1] : pos[0];

            coefA = ls_(posA, tidy);
            ls_(posA, tidy) = ls_(tidx, tidy);
            ls_(tidx, tidy) = coefA;
         }

         /* Row scaling: divide the pivot row (entries right of the diagonal)
            by the pivot */
         __syncthreads();
         if ((tidx == k) && (tidy > k))
         {
            ls_(tidx, tidy) = ls_(tidx, tidy) * (1.0 / ls_(tidx, k));
         }

         /* Row elimination: update every other row using the scaled pivot row */
         __syncthreads();
         if ((tidx != k) && (tidy > k))
         {
            ls_(tidx, tidy) -= ls_(tidx, k) * ls_(k, tidy);
         }
      }

      /* The last slice of the augmented system now holds the solution */
      __syncthreads();
      if (tidy == batch_dim)
      {
         sol_data[tidx] = ls_(tidx, batch_dim);
      }
   }
}
|
||||||
|
|
||||||
/*--------------------------------------------------------------------
|
/*--------------------------------------------------------------------
|
||||||
* hypreGPUKernel_FSAIExtractSubSystems
|
* hypreGPUKernel_FSAIExtractSubSystems
|
||||||
*
|
*
|
||||||
@ -38,7 +147,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
|
|||||||
HYPRE_Int *P_i,
|
HYPRE_Int *P_i,
|
||||||
HYPRE_Int *P_e,
|
HYPRE_Int *P_e,
|
||||||
HYPRE_Int *P_j,
|
HYPRE_Int *P_j,
|
||||||
HYPRE_Int ldim,
|
HYPRE_Int batch_dim,
|
||||||
HYPRE_Complex *mat_data,
|
HYPRE_Complex *mat_data,
|
||||||
HYPRE_Complex *rhs_data,
|
HYPRE_Complex *rhs_data,
|
||||||
HYPRE_Int *G_r )
|
HYPRE_Int *G_r )
|
||||||
@ -57,9 +166,9 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
|
|||||||
i += hypre_gpu_get_grid_num_warps<1, 1>(item))
|
i += hypre_gpu_get_grid_num_warps<1, 1>(item))
|
||||||
{
|
{
|
||||||
/* Set identity matrix */
|
/* Set identity matrix */
|
||||||
for (j = lane; j < ldim; j += HYPRE_WARP_SIZE)
|
for (j = lane; j < batch_dim; j += HYPRE_WARP_SIZE)
|
||||||
{
|
{
|
||||||
mat_(ldim, i, j, j) = 1.0;
|
mat_(batch_dim, i, j, j) = 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lane == 0)
|
if (lane == 0)
|
||||||
@ -104,7 +213,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
|
|||||||
{
|
{
|
||||||
if (lane == (hypre_ffs(bitmask) - 1))
|
if (lane == (hypre_ffs(bitmask) - 1))
|
||||||
{
|
{
|
||||||
rhs_(ldim, i, j - pj) = - read_only_load(A_a + k);
|
rhs_(batch_dim, i, j - pj) = - read_only_load(A_a + k);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -149,8 +258,8 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
|
|||||||
if (lane == (hypre_ffs(bitmask) - 1))
|
if (lane == (hypre_ffs(bitmask) - 1))
|
||||||
{
|
{
|
||||||
val = read_only_load(A_a + k);
|
val = read_only_load(A_a + k);
|
||||||
mat_(ldim, i, j - pj, jj - pj) = val;
|
mat_(batch_dim, i, j - pj, jj - pj) = val;
|
||||||
mat_(ldim, i, jj - pj, j - pj) = val;
|
mat_(batch_dim, i, jj - pj, j - pj) = val;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -176,7 +285,7 @@ hypreGPUKernel_FSAIExtractSubSystems( hypre_DeviceItem &item,
|
|||||||
__global__ void
|
__global__ void
|
||||||
hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
|
hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
|
||||||
HYPRE_Int num_rows,
|
HYPRE_Int num_rows,
|
||||||
HYPRE_Int ldim,
|
HYPRE_Int batch_dim,
|
||||||
HYPRE_Complex *sol_data,
|
HYPRE_Complex *sol_data,
|
||||||
HYPRE_Complex *rhs_data,
|
HYPRE_Complex *rhs_data,
|
||||||
HYPRE_Complex *scaling,
|
HYPRE_Complex *scaling,
|
||||||
@ -191,9 +300,9 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
|
|||||||
i += hypre_gpu_get_grid_num_threads<1, 1>(item))
|
i += hypre_gpu_get_grid_num_threads<1, 1>(item))
|
||||||
{
|
{
|
||||||
val = scaling[i];
|
val = scaling[i];
|
||||||
for (j = 0; j < ldim; j++)
|
for (j = 0; j < batch_dim; j++)
|
||||||
{
|
{
|
||||||
val += sol_(ldim, i, j) * rhs_(ldim, i, j);
|
val += sol_(batch_dim, i, j) * rhs_(batch_dim, i, j);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (val > 0)
|
if (val > 0)
|
||||||
@ -222,7 +331,7 @@ hypreGPUKernel_FSAIScaling( hypre_DeviceItem &item,
|
|||||||
__global__ void
|
__global__ void
|
||||||
hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item,
|
hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item,
|
||||||
HYPRE_Int num_rows,
|
HYPRE_Int num_rows,
|
||||||
HYPRE_Int ldim,
|
HYPRE_Int batch_dim,
|
||||||
HYPRE_Complex *sol_data,
|
HYPRE_Complex *sol_data,
|
||||||
HYPRE_Complex *scaling,
|
HYPRE_Complex *scaling,
|
||||||
HYPRE_Int *K_i,
|
HYPRE_Int *K_i,
|
||||||
@ -258,7 +367,7 @@ hypreGPUKernel_FSAIGatherEntries( hypre_DeviceItem &item,
|
|||||||
col = K_j[j];
|
col = K_j[j];
|
||||||
|
|
||||||
G_j[cnt + il] = col;
|
G_j[cnt + il] = col;
|
||||||
G_a[cnt + il] = sol_(ldim, i, il) * val;
|
G_a[cnt + il] = sol_(batch_dim, i, il) * val;
|
||||||
il++;
|
il++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -575,6 +684,39 @@ hypreGPUKernel_FSAITruncateCandidateUnordered( hypre_DeviceItem &item,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*--------------------------------------------------------------------------
 * hypre_BatchedGaussJordanSolveDevice
 *
 * Solve dense linear systems with less than 32 unknowns via Gauss-Jordan
 * elimination.
 *
 * Parameters:
 *   batch_num_items - number of linear systems in the batch
 *   batch_dim       - number of unknowns of each system (at most 31)
 *   mat_data        - device array with the system matrices
 *   rhs_data        - device array with the right-hand sides
 *   sol_data        - device array receiving the solutions
 *
 * Returns hypre_error_flag. An error is raised when batch_dim > 31; an empty
 * batch (no systems or no unknowns) returns without launching the kernel.
 *--------------------------------------------------------------------------*/

HYPRE_Int
hypre_BatchedGaussJordanSolveDevice( HYPRE_Int       batch_num_items,
                                     HYPRE_Int       batch_dim,
                                     HYPRE_Complex  *mat_data,
                                     HYPRE_Complex  *rhs_data,
                                     HYPRE_Complex  *sol_data )
{
   /* The kernel uses batch_dim * (batch_dim + 1) threads per block and is
      bounded to 1024 threads: 31 * 32 = 992 fits, 32 * 33 = 1056 does not */
   if (batch_dim > 31)
   {
      hypre_error_w_msg(HYPRE_ERROR_GENERIC,
                        "Error: cannot solve for local systems larger than 31.");
      return hypre_error_flag;
   }

   /* Nothing to solve: avoid launching with an empty grid or thread block */
   if (batch_num_items < 1 || batch_dim < 1)
   {
      return hypre_error_flag;
   }

   /* Assign one linear system per thread block */
   dim3 bDim = hypre_dim3(batch_dim, batch_dim + 1, 1);
   dim3 gDim = hypre_dim3(batch_num_items, 1, 1);

   /* Augmented system plus two pivot magnitudes, followed by two pivot rows */
   HYPRE_Int shared_mem_size = (sizeof(HYPRE_Complex) * ((batch_dim + 1) * batch_dim + 2) +
                                sizeof(HYPRE_Int) * 2);

   HYPRE_GPU_LAUNCH2(hypreGPUKernel_BatchedGaussJordanSolve, gDim, bDim, shared_mem_size,
                     batch_num_items, batch_dim, mat_data, rhs_data, sol_data);

   return hypre_error_flag;
}
|
||||||
|
|
||||||
/*--------------------------------------------------------------------------
|
/*--------------------------------------------------------------------------
|
||||||
* hypre_FSAIExtractSubSystemsDevice
|
* hypre_FSAIExtractSubSystemsDevice
|
||||||
*
|
*
|
||||||
@ -590,7 +732,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int num_rows,
|
|||||||
HYPRE_Int *P_i,
|
HYPRE_Int *P_i,
|
||||||
HYPRE_Int *P_e,
|
HYPRE_Int *P_e,
|
||||||
HYPRE_Int *P_j,
|
HYPRE_Int *P_j,
|
||||||
HYPRE_Int ldim,
|
HYPRE_Int batch_dim,
|
||||||
HYPRE_Complex *mat_data,
|
HYPRE_Complex *mat_data,
|
||||||
HYPRE_Complex *rhs_data,
|
HYPRE_Complex *rhs_data,
|
||||||
HYPRE_Int *G_r )
|
HYPRE_Int *G_r )
|
||||||
@ -605,7 +747,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int num_rows,
|
|||||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "warp", bDim);
|
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "warp", bDim);
|
||||||
|
|
||||||
HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIExtractSubSystems, gDim, bDim, num_rows,
|
HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIExtractSubSystems, gDim, bDim, num_rows,
|
||||||
A_i, A_j, A_a, P_i, P_e, P_j, ldim, mat_data, rhs_data, G_r );
|
A_i, A_j, A_a, P_i, P_e, P_j, batch_dim, mat_data, rhs_data, G_r );
|
||||||
|
|
||||||
return hypre_error_flag;
|
return hypre_error_flag;
|
||||||
}
|
}
|
||||||
@ -616,7 +758,7 @@ hypre_FSAIExtractSubSystemsDevice( HYPRE_Int num_rows,
|
|||||||
|
|
||||||
HYPRE_Int
|
HYPRE_Int
|
||||||
hypre_FSAIScalingDevice( HYPRE_Int num_rows,
|
hypre_FSAIScalingDevice( HYPRE_Int num_rows,
|
||||||
HYPRE_Int ldim,
|
HYPRE_Int batch_dim,
|
||||||
HYPRE_Complex *sol_data,
|
HYPRE_Complex *sol_data,
|
||||||
HYPRE_Complex *rhs_data,
|
HYPRE_Complex *rhs_data,
|
||||||
HYPRE_Complex *scaling,
|
HYPRE_Complex *scaling,
|
||||||
@ -632,7 +774,7 @@ hypre_FSAIScalingDevice( HYPRE_Int num_rows,
|
|||||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
|
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
|
||||||
|
|
||||||
HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIScaling, gDim, bDim,
|
HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIScaling, gDim, bDim,
|
||||||
num_rows, ldim, sol_data, rhs_data, scaling, info );
|
num_rows, batch_dim, sol_data, rhs_data, scaling, info );
|
||||||
|
|
||||||
return hypre_error_flag;
|
return hypre_error_flag;
|
||||||
}
|
}
|
||||||
@ -643,7 +785,7 @@ hypre_FSAIScalingDevice( HYPRE_Int num_rows,
|
|||||||
|
|
||||||
HYPRE_Int
|
HYPRE_Int
|
||||||
hypre_FSAIGatherEntriesDevice( HYPRE_Int num_rows,
|
hypre_FSAIGatherEntriesDevice( HYPRE_Int num_rows,
|
||||||
HYPRE_Int ldim,
|
HYPRE_Int batch_dim,
|
||||||
HYPRE_Complex *sol_data,
|
HYPRE_Complex *sol_data,
|
||||||
HYPRE_Complex *scaling,
|
HYPRE_Complex *scaling,
|
||||||
HYPRE_Int *K_i,
|
HYPRE_Int *K_i,
|
||||||
@ -663,7 +805,7 @@ hypre_FSAIGatherEntriesDevice( HYPRE_Int num_rows,
|
|||||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
|
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
|
||||||
|
|
||||||
HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIGatherEntries, gDim, bDim,
|
HYPRE_GPU_LAUNCH( hypreGPUKernel_FSAIGatherEntries, gDim, bDim,
|
||||||
num_rows, ldim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a );
|
num_rows, batch_dim, sol_data, scaling, K_i, K_e, K_j, G_i, G_j, G_a );
|
||||||
|
|
||||||
return hypre_error_flag;
|
return hypre_error_flag;
|
||||||
}
|
}
|
||||||
@ -737,6 +879,9 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata,
|
|||||||
HYPRE_Int block_size = max_nnz_row * max_nnz_row;
|
HYPRE_Int block_size = max_nnz_row * max_nnz_row;
|
||||||
HYPRE_Int num_nonzeros_G;
|
HYPRE_Int num_nonzeros_G;
|
||||||
|
|
||||||
|
HYPRE_Complex **sol_aop = NULL;
|
||||||
|
HYPRE_Complex **mat_aop = NULL;
|
||||||
|
|
||||||
hypre_ParCSRMatrix *Atilde;
|
hypre_ParCSRMatrix *Atilde;
|
||||||
hypre_ParCSRMatrix *B;
|
hypre_ParCSRMatrix *B;
|
||||||
hypre_ParCSRMatrix *Ktilde;
|
hypre_ParCSRMatrix *Ktilde;
|
||||||
@ -782,6 +927,13 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata,
|
|||||||
return hypre_error_flag;
|
return hypre_error_flag;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
else if (local_solve_type == 0)
|
||||||
|
{
|
||||||
|
if (max_nnz_row > 31)
|
||||||
|
{
|
||||||
|
hypre_ParFSAIDataMaxNnzRow(fsai_data) = max_nnz_row = 31;
|
||||||
|
}
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unknown local linear solve type!\n");
|
hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unknown local linear solve type!\n");
|
||||||
@ -926,15 +1078,18 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata,
|
|||||||
hypre_GpuProfilingPopRange();
|
hypre_GpuProfilingPopRange();
|
||||||
|
|
||||||
/* Build array of pointers */
|
/* Build array of pointers */
|
||||||
hypre_GpuProfilingPushRange("Storage2");
|
if (local_solve_type != 0)
|
||||||
HYPRE_Complex **sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
|
{
|
||||||
HYPRE_Complex **mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
|
hypre_GpuProfilingPushRange("Storage2");
|
||||||
hypre_GpuProfilingPopRange();
|
sol_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
|
||||||
|
mat_aop = hypre_TAlloc(HYPRE_Complex *, num_rows, HYPRE_MEMORY_DEVICE);
|
||||||
|
hypre_GpuProfilingPopRange();
|
||||||
|
|
||||||
hypre_GpuProfilingPushRange("FormAOP");
|
hypre_GpuProfilingPushRange("FormAOP");
|
||||||
hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop);
|
hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, block_size, mat_data, mat_aop);
|
||||||
hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop);
|
hypreDevice_ComplexArrayToArrayOfPtrs(num_rows, max_nnz_row, sol_data, sol_aop);
|
||||||
hypre_GpuProfilingPopRange();
|
hypre_GpuProfilingPopRange();
|
||||||
|
}
|
||||||
|
|
||||||
/*-----------------------------------------------------
|
/*-----------------------------------------------------
|
||||||
* Solve local linear systems
|
* Solve local linear systems
|
||||||
@ -995,7 +1150,11 @@ hypre_FSAISetupStaticPowerDevice( void *fsai_vdata,
|
|||||||
|
|
||||||
hypre_GpuProfilingPushRange("Solve");
|
hypre_GpuProfilingPushRange("Solve");
|
||||||
|
|
||||||
if (local_solve_type == 1)
|
if (local_solve_type == 0)
|
||||||
|
{
|
||||||
|
hypre_BatchedGaussJordanSolveDevice(num_rows, max_nnz_row, mat_data, rhs_data, sol_data);
|
||||||
|
}
|
||||||
|
else if (local_solve_type == 1)
|
||||||
{
|
{
|
||||||
#if defined (HYPRE_USING_CUSOLVER)
|
#if defined (HYPRE_USING_CUSOLVER)
|
||||||
HYPRE_CUSOLVER_CALL(cusolverDnDpotrsBatched(vs_handle,
|
HYPRE_CUSOLVER_CALL(cusolverDnDpotrsBatched(vs_handle,
|
||||||
|
|||||||
@ -2693,7 +2693,8 @@ hypre_CudaCompileFlagCheck()
|
|||||||
const hypre_int cuda_arch_actual_minor = cuda_arch_actual % 100;
|
const hypre_int cuda_arch_actual_minor = cuda_arch_actual % 100;
|
||||||
const hypre_int cuda_arch_compile_minor = cuda_arch_compile % 100;
|
const hypre_int cuda_arch_compile_minor = cuda_arch_compile % 100;
|
||||||
|
|
||||||
if (cuda_arch_actual_major != cuda_arch_compile_major || cuda_arch_actual_minor < cuda_arch_compile_minor)
|
if (cuda_arch_actual_major != cuda_arch_compile_major ||
|
||||||
|
cuda_arch_actual_minor < cuda_arch_compile_minor)
|
||||||
{
|
{
|
||||||
char msg[256];
|
char msg[256];
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user