Switch to HYPRE_GPU_LAUNCH

This commit is contained in:
Wayne Mitchell 2022-02-15 20:01:08 +00:00
parent 7883708a4f
commit 4136c63269
30 changed files with 128 additions and 132 deletions

View File

@ -164,7 +164,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix,
/* mark unwanted elements as -1 */
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(len1, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator,
(HYPRE_Int *) row_indexes, ncols, indicator );
auto new_end = HYPRE_THRUST_CALL(
@ -233,7 +233,7 @@ hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big
/*
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(N0, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 );
*/
/* output X: 0: keep, 1: zero-out */

View File

@ -251,7 +251,7 @@ hypre_IJVectorAssembleParDevice(hypre_IJVector *vector)
/* set/add to local vector */
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(new_nnz, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i,
vec_start, new_sora,
hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) );

View File

@ -627,12 +627,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nnz, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
F2V_diag_nnz, 3, F2V_diag_J, Pi_diag_J );
gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data,
Pi_diag_data );
}
@ -693,12 +693,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nnz, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
F2V_offd_nnz, 3, F2V_offd_J, Pi_offd_J );
gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data,
Pi_offd_data );
}
@ -907,7 +907,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data,
Pix_diag_data, Piy_diag_data, Piz_diag_data );
}
@ -987,7 +987,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data,
Pix_offd_data, Piy_offd_data, Piz_offd_data );
}

View File

@ -496,7 +496,7 @@ HYPRE_Int hypre_AMESetup(void *esolver)
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim,
nv, GtdI, GtdJ, GtdA, GtoI, GtoJ, GtoA, edge_bc, offd_edge_bc );
}
else

View File

@ -194,7 +194,7 @@ HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x,
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
}
else
@ -241,7 +241,7 @@ HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x,
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
}
else
@ -456,7 +456,7 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A)
bDim = hypre_GetDefaultDeviceBlockDimension();
gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
HYPRE_GPU_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd);
//hypre_SyncComputeStream(hypre_handle());
@ -787,7 +787,7 @@ HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d)
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d);
}
else
@ -1623,12 +1623,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
G_diag_nnz, dim, G_diag_J, Pi_diag_J );
gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
Pi_diag_data );
}
@ -1696,12 +1696,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
G_offd_nnz, dim, G_offd_J, Pi_offd_J );
gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
Pi_offd_data );
}
@ -1944,7 +1944,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
Pix_diag_data, Piy_diag_data, Piz_diag_data );
}
@ -2010,7 +2010,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL,
Pix_diag_data, Piy_diag_data, NULL );
}
@ -2068,7 +2068,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL,
Pix_diag_data, NULL, NULL );
}
@ -2145,7 +2145,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
Pix_offd_data, Piy_offd_data, Piz_offd_data );
}
@ -2227,7 +2227,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL,
Pix_offd_data, Piy_offd_data, NULL );
}
@ -2299,7 +2299,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL,
Pix_offd_data, NULL, NULL );
}
@ -2501,12 +2501,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
G_diag_nnz, dim, G_diag_J, GPi_diag_J );
gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
GPi_diag_data );
}
@ -2575,12 +2575,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
G_offd_nnz, dim, G_offd_J, GPi_offd_J );
gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
GPi_offd_data );
}
@ -2815,7 +2815,7 @@ HYPRE_Int hypre_AMSSetup(void *solver,
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data );
}
else
@ -3401,7 +3401,7 @@ HYPRE_Int hypre_AMSSetup(void *solver,
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(Gt_num_rows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data, Gt_offd_I, Gt_offd_data,
Gx_data, Gy_data, Gz_data );
}

View File

@ -109,7 +109,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A,
dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim);
/* only for rows corresponding to F2 (notice flag == -1) */
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
gDim, bDim,
A_nr_local,
A_offd_nnz > 0,
@ -160,7 +160,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A,
* diagnoally scale As_F2F (from both sides) and replace the diagonal */
gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF,
gDim, bDim,
AF2F_nr_local,
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)),
@ -329,7 +329,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A,
dlam = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE);
dtmp = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
gDim, bDim,
AFC_nr_local,
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_FF)),
@ -388,7 +388,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A,
gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim);
/* only for rows corresponding to F2 (notice flag == -1) */
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
gDim, bDim,
A_nr_local,
A_offd_nnz > 0,
@ -438,7 +438,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A,
* diagnoally scale As_F2F (from both sides) and replace the diagonal */
gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF,
gDim, bDim,
AF2F_nr_local,
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)),

View File

@ -331,7 +331,7 @@ hypre_PMISCoarseningInitDevice( hypre_ParCSRMatrix *S, /* in */
HYPRE_Int *new_end;
/* init CF_marker_diag and measure_diag: remove some special nodes */
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim,
num_rows_diag, CF_init, S_diag_i, S_offd_i, measure_diag, CF_marker_diag );
/* communicate for measure_offd */
@ -494,7 +494,7 @@ hypre_PMISCoarseningUpdateCFDevice( hypre_ParCSRMatrix *S, /* in
bDim = hypre_GetDefaultDeviceBlockDimension();
gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF,
gDim, bDim,
graph_diag_size,
graph_diag,

View File

@ -424,7 +424,7 @@ HYPRE_Int hypre_dgemv_device(HYPRE_Int m, HYPRE_Int n, HYPRE_Int lda, HYPRE_Real
dim3 bDim(BLOCK_SIZE, 1, 1);
dim3 gDim = hypre_GetDefaultDeviceGridDimension(m, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y );
return hypre_error_flag;
}

View File

@ -170,7 +170,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix *S,
bDim = hypre_GetDefaultDeviceBlockDimension();
gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim,
graph_diag_size, graph_diag, measure_diag, measure_offd,
S_diag_i, S_diag_j, S_offd_i, S_offd_j,
IS_marker_diag, IS_marker_offd, IS_offd_temp_mark );
@ -186,7 +186,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix *S,
/* adjust IS_marker_diag from the received */
gDim = hypre_GetDefaultDeviceGridDimension(num_elmts_send, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim,
IS_marker_diag, num_elmts_send, send_map_elmts,
int_send_buf, IS_offd_temp_mark );

View File

@ -197,7 +197,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim,
n_fine, S_diag_i, S_diag_j, S_offd_i, S_offd_j,
CF_marker, CF_marker_offd, num_functions,
dof_func_dev, dof_func_offd, P_diag_i, P_offd_i);
@ -228,7 +228,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A,
if (interp_type == 3)
{
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim,
n_fine, A_diag_i, A_diag_j, A_diag_data,
A_offd_i, A_offd_j, A_offd_data,
hypre_ParCSRMatrixSocDiagJ(S),
@ -241,7 +241,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A,
}
else
{
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim,
n_fine, A_diag_i, A_diag_j, A_diag_data,
A_offd_i, A_offd_j, A_offd_data,
hypre_ParCSRMatrixSocDiagJ(S),
@ -1161,7 +1161,7 @@ hypre_BoomerAMGBuildInterpOnePntDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim,
n_fine, A_diag_i, A_strong_diag_j, A_diag_a, A_offd_i, A_strong_offd_j,
A_offd_a, CF_marker, CF_marker_offd, diag_compress_marker,
offd_compress_marker, P_diag_i, P_diag_j_temp, P_offd_i, P_offd_j_temp);

View File

@ -162,7 +162,7 @@ hypre_BoomerAMGInterpTruncationDevice( hypre_ParCSRMatrix *P, HYPRE_Real trunc_f
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim,
nrows, trunc_factor, max_elmts, P_rowptr, P_j, P_a );
/* build new P_diag and P_offd */

View File

@ -87,7 +87,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
gDim, bDim,
A_nr_of_rows,
A_offd_nnz > 0,
@ -128,7 +128,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A,
/* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */
hypre_GpuProfilingPushRange("Compute interp matrix");
gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc,
gDim, bDim,
W_nr_of_rows,
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
@ -273,7 +273,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
gDim, bDim,
A_nr_of_rows,
A_offd_nnz > 0,
@ -352,7 +352,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A,
hypre_GpuProfilingPushRange("Compute interp matrix");
gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_twiaff_w,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_twiaff_w,
gDim, bDim,
W_nr_of_rows,
hypre_ParCSRMatrixFirstRowIndex(AFF),
@ -502,7 +502,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
gDim, bDim,
A_nr_of_rows,
A_offd_nnz > 0,
@ -545,7 +545,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A,
dtmp = hypre_TAlloc(HYPRE_Complex, W_nr_of_rows, HYPRE_MEMORY_DEVICE);
hypre_GpuProfilingPushRange("Compute D_tmp");
gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
gDim, bDim,
W_nr_of_rows,
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
@ -587,7 +587,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A,
/* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */
hypre_GpuProfilingPushRange("Compute interp matrix");
gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe,
gDim, bDim,
W_nr_of_rows,
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),

View File

@ -254,7 +254,7 @@ hypre_BoomerAMGBuildRestrNeumannAIRDevice( hypre_ParCSRMatrix *A,
/* assemble the diagonal part of R from Z */
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim,
n_cpts, Fmap, Cmap, Z_diag_i, Z_diag_j, Z_diag_a, R_diag_i, R_diag_j, R_diag_a);
num_cols_offd_R = num_cols_offd_Z;

View File

@ -343,7 +343,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A,
dim3 gDim = hypre_GetDefaultDeviceGridDimension(remaining, "warp", bDim);
/* output diag_shifts is 0/1 indicating if points_left_dev[i] is picked in this pass */
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_pass_order_count,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_pass_order_count,
gDim, bDim,
remaining,
current_pass,
@ -438,7 +438,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim,
n_fine, A_diag_i, A_diag_j, A_diag_data,
A_offd_i, A_offd_j, A_offd_data,
CF_marker,
@ -591,7 +591,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim,
pass_starts[p + 1], pass_starts[p + 2], pass_order,
Pi_diag_i, Pi_diag_j, Pi_diag_data,
P_diag_i, P_diag_j, P_diag_data,
@ -654,7 +654,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(npoints, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim,
pass_starts[p + 1],
pass_starts[p + 2],
pass_order,
@ -893,7 +893,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
num_points, color, pass_order, pass_marker, pass_marker_offd,
S_diag_i, S_diag_j, S_offd_i, S_offd_j,
P_diag_i, P_offd_i );
@ -921,7 +921,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim,
num_points,
color,
pass_order,
@ -1144,7 +1144,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
num_points, color, pass_order, pass_marker, pass_marker_offd,
S_diag_i, S_diag_j, S_offd_i, S_offd_j,
Q_diag_i, Q_offd_i );
@ -1173,7 +1173,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim,
num_points,
color,
pass_order,
@ -1244,7 +1244,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim,
num_points, pass_order, A_diag_i, A_diag_data,
Pi_diag_i, Pi_diag_data, Pi_offd_i, Pi_offd_data,
w_row_sum );

View File

@ -155,7 +155,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A,
bDim = hypre_GetDefaultDeviceBlockDimension();
gDim = hypre_GetDefaultDeviceGridDimension(A_num_rows, "warp", bDim);
HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate,
HYPRE_GPU_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate,
gDim,
bDim,
A_num_rows,

View File

@ -140,7 +140,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A,
if (abs_soc)
{
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim,
num_variables, max_row_sum, strength_threshold,
A_diag_data, A_diag_i, A_diag_j,
A_offd_data, A_offd_i, A_offd_j,
@ -150,7 +150,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A,
}
else
{
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim,
num_variables, max_row_sum, strength_threshold,
A_diag_data, A_diag_i, A_diag_j,
A_offd_data, A_offd_i, A_offd_j,

View File

@ -628,7 +628,7 @@ hypre_ConcatDiagAndOffdDevice(hypre_ParCSRMatrix *A)
const dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
const dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
gDim, bDim,
hypre_CSRMatrixNumRows(A_diag),
hypre_CSRMatrixNumCols(A_diag),
@ -745,7 +745,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_ParCSRMatrixNumRows(A), "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
gDim, bDim,
hypre_CSRMatrixNumRows(A_diag),
hypre_CSRMatrixNumCols(A_diag),
@ -777,7 +777,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A,
hypre_assert(hypre_CSRMatrixNumCols(E_diag) == hypre_CSRMatrixNumCols(A_diag));
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
gDim, bDim,
hypre_CSRMatrixNumRows(E_diag),
hypre_CSRMatrixNumCols(E_diag),
@ -1211,21 +1211,21 @@ hypre_ParCSRMatrixDropSmallEntriesDevice( hypre_ParCSRMatrix *A,
if (type == -1)
{
HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols < -1 >, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols < -1 >, gDim, bDim,
hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
}
if (type == 1)
{
HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim,
hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
}
if (type == 2)
{
HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim,
hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);

View File

@ -714,7 +714,7 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A )
bDim = hypre_GetDefaultDeviceBlockDimension();
gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim,
HYPRE_GPU_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim,
nrows, A_i, A_j, A_data);
hypre_SyncComputeStream(hypre_handle());
@ -751,7 +751,7 @@ hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A )
dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim);
HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRCheckDiagFirst, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRCheckDiagFirst, gDim, bDim,
hypre_CSRMatrixNumRows(A),
hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result );
@ -845,7 +845,7 @@ hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A,
HYPRE_Int *result = NULL;
#endif
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim,
v, hypre_CSRMatrixNumRows(A),
hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A),
tol, result );
@ -940,7 +940,7 @@ hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A,
HYPRE_Int *result = NULL;
#endif
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixReplaceDiagDevice, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRMatrixReplaceDiagDevice, gDim, bDim,
new_diag, v, hypre_CSRMatrixNumRows(A),
hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A),
tol, result );
@ -1135,17 +1135,17 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A,
if (type == 0)
{
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
row_sum, scal, set_or_add[0] == 's' );
}
else if (type == 1)
{
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
row_sum, scal, set_or_add[0] == 's' );
}
else if (type == 2)
{
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
row_sum, scal, set_or_add[0] == 's' );
}
@ -1240,7 +1240,7 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A,
bDim = hypre_GetDefaultDeviceBlockDimension();
gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type );
hypre_SyncComputeStream(hypre_handle());
}
@ -1560,7 +1560,7 @@ hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixIntersectPattern, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRMatrixIntersectPattern, gDim, bDim,
nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt );
hypre_TFree(Cii, HYPRE_MEMORY_DEVICE);
@ -1640,7 +1640,7 @@ hypre_CSRMatrixDiagScaleDevice( hypre_CSRMatrix *A,
bDim = hypre_GetDefaultDeviceBlockDimension();
gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);
HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRDiagScale, gDim, bDim,
HYPRE_GPU_LAUNCH(hypreCUDAKernel_CSRDiagScale, gDim, bDim,
nrows, A_i, A_j, A_data, ldata, rdata);
hypre_SyncComputeStream(hypre_handle());

View File

@ -506,7 +506,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m,
// for cases where one WARP works on a row
dim3 gDim( (m + bDim.z - 1) / bDim.z );
HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt<num_warps_per_block, shmem_hash_size, 1, hash_type>),
HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt<num_warps_per_block, shmem_hash_size, 1, hash_type>),
gDim, bDim, /* shmem_size, */
m, NULL, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash1_i, d_ghash1_j, d_ghash1_a,
d_rc, d_rf );
@ -542,7 +542,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m,
// for cases where one WARP works on a row
dim3 gDim( (num_failed_rows + bDim.z - 1) / bDim.z );
HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt<num_warps_per_block, shmem_hash_size, 2, hash_type>),
HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt<num_warps_per_block, shmem_hash_size, 2, hash_type>),
gDim, bDim, /* shmem_size, */
num_failed_rows, rf_ind, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash2_i, d_ghash2_j,
d_ghash2_a,
@ -563,7 +563,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m,
// for cases where one WARP works on a row
dim3 gDim( (m + bDim.z - 1) / bDim.z );
HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_hash_into_C<num_warps_per_block, shmem_hash_size>), gDim,
HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_hash_into_C<num_warps_per_block, shmem_hash_size>), gDim,
bDim,
m, d_rf,
d_js, d_as,

View File

@ -467,7 +467,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m,
hypre_create_ija(m, d_rc, d_ic, &d_jc, &d_c, &nnzC_nume);
HYPRE_CUDA_LAUNCH ( (hypre_spgemm_numeric < num_warps_per_block, shmem_hash_size, !exact_rownnz,
HYPRE_GPU_LAUNCH ( (hypre_spgemm_numeric < num_warps_per_block, shmem_hash_size, !exact_rownnz,
hash_type > ),
gDim, bDim, /* shmem_size, */
m, /* k, n, */ d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_ic, d_jc, d_c, d_rc,
@ -493,7 +493,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m,
/* copy to the final C */
dim3 gDim( (m + bDim.z - 1) / bDim.z );
HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C<num_warps_per_block>), gDim, bDim,
HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C<num_warps_per_block>), gDim, bDim,
m, d_ic, d_jc, d_c, d_ic_new, d_jc_new, d_c_new );
hypre_TFree(d_ic, HYPRE_MEMORY_DEVICE);

View File

@ -313,19 +313,19 @@ hypre_spgemm_rownnz_attempt(HYPRE_Int m,
* ---------------------------------------------------------------------------*/
if (hash_type == 'L')
{
HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'L'>),
HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'L'>),
gDim, bDim,
m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf );
}
else if (hash_type == 'Q')
{
HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'Q'>),
HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'Q'>),
gDim, bDim,
m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf );
}
else if (hash_type == 'D')
{
HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'D'>),
HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'D'>),
gDim, bDim,
m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf );
}

View File

@ -287,11 +287,11 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i
dim3 gDim( (nsamples * N + bDim.z * HYPRE_WARP_SIZE - 1) / (bDim.z * HYPRE_WARP_SIZE) );
HYPRE_CUDA_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 );
HYPRE_GPU_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 );
/* step-1: layer 3-2 */
gDim.x = (K + bDim.z - 1) / bDim.z;
HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel<T, NUM_WARPS_PER_BLOCK, SHMEM_SIZE_PER_WARP, 2>), gDim,
HYPRE_GPU_LAUNCH( (cohen_rowest_kernel<T, NUM_WARPS_PER_BLOCK, SHMEM_SIZE_PER_WARP, 2>), gDim,
bDim,
K, d_ib, d_jb, d_V1, d_V2, NULL, nsamples, NULL, NULL, -1.0);
@ -301,7 +301,7 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i
d_V3 = (T*) d_rc;
gDim.x = (M + bDim.z - 1) / bDim.z;
HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel<T, NUM_WARPS_PER_BLOCK, SHMEM_SIZE_PER_WARP, 1>), gDim,
HYPRE_GPU_LAUNCH( (cohen_rowest_kernel<T, NUM_WARPS_PER_BLOCK, SHMEM_SIZE_PER_WARP, 1>), gDim,
bDim,
M, d_ia, d_ja, d_V2, d_V3, d_rc, nsamples, d_low, d_upp, mult_factor);
@ -336,13 +336,13 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n,
if (row_est_mtd == 1)
{
/* naive overestimate */
HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim,
HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim,
m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, NULL, d_rc );
}
else if (row_est_mtd == 2)
{
/* naive underestimate */
HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim,
HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim,
m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_rc, NULL );
}
else if (row_est_mtd == 3)
@ -361,7 +361,7 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n,
HYPRE_Int *d_low = d_low_upp;
HYPRE_Int *d_upp = d_low_upp + m;
HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim,
HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim,
m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_low, d_upp );
/* Cohen's algorithm, stochastic approach */

View File

@ -103,14 +103,14 @@ hypre_SpGemmCreateGlobalHashTable( HYPRE_Int num_rows, /* number of
{
ghash_i = hypre_TAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE);
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_ghash, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim,
num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE );
}
else if (type == 2)
{
ghash_i = hypre_CTAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE);
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim,
num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE );
}

View File

@ -170,7 +170,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows,
const HYPRE_Int group_size = 32;
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size;
const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block);
HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y );
}
else if (rownnz >= 32)
@ -178,7 +178,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows,
const HYPRE_Int group_size = 16;
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size;
const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block);
HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y );
}
else if (rownnz >= 16)
@ -186,7 +186,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows,
const HYPRE_Int group_size = 8;
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size;
const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block);
HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y );
}
else if (rownnz >= 8)
@ -194,7 +194,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows,
const HYPRE_Int group_size = 4;
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size;
const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block);
HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y );
}
else
@ -202,7 +202,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows,
const HYPRE_Int group_size = 4;
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size;
const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block);
HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y );
}

View File

@ -810,7 +810,7 @@ extern "C++"
const dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim);
HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length );
HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length );
}
}
@ -871,7 +871,7 @@ extern "C++"
hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x);
*/
HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body );
HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body );
}
}

View File

@ -76,7 +76,7 @@ extern "C++"
const dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim);
HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length );
HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length );
}
}
@ -137,7 +137,7 @@ extern "C++"
hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x);
*/
HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body );
HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body );
}
}

View File

@ -130,8 +130,6 @@ struct hypre_device_allocator
#include <oneapi/mkl/rng.hpp>
#endif
#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP)
#endif // defined(HYPRE_USING_CUDA)
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@ -179,7 +177,7 @@ struct hypre_device_allocator
func_name(oneapi::dpl::execution::make_device_policy( \
*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);
#define HYPRE_SYCL_LAUNCH(kernel_name, gridsize, blocksize, ...) \
#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) \
{ \
if ( gridsize[0] == 0 || blocksize[0] == 0 ) \
{ \
@ -496,7 +494,7 @@ using namespace thrust::placeholders;
#define GPU_LAUNCH_SYNC
#endif // defined(HYPRE_DEBUG)
#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \
#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \
{ \
if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \
blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \
@ -512,7 +510,7 @@ using namespace thrust::placeholders;
} \
}
#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__)
#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__)
/* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right
* The following one works OK for now */
@ -1375,7 +1373,7 @@ struct ReduceSum
/* 2nd reduction with only *one* block */
hypre_assert(nblocks >= 0 && nblocks <= 1024);
const dim3 gDim(1), bDim(1024);
HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
val += init;
}

View File

@ -267,7 +267,7 @@ struct ReduceSum
/* 2nd reduction with only *one* block */
hypre_assert(nblocks >= 0 && nblocks <= 1024);
const dim3 gDim(1), bDim(1024);
HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
val += init;
}

View File

@ -68,7 +68,7 @@ hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Comple
sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension();
sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
HYPRE_SYCL_LAUNCH( hypreSYCLKernel_IVAXPY, gDim, bDim, n, a, x, y );
HYPRE_GPU_LAUNCH( hypreSYCLKernel_IVAXPY, gDim, bDim, n, a, x, y );
return hypre_error_flag;
}
@ -103,7 +103,7 @@ hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_
sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension();
sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
HYPRE_SYCL_LAUNCH( hypreSYCLKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
HYPRE_GPU_LAUNCH( hypreSYCLKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
return hypre_error_flag;
}
@ -156,7 +156,7 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_
HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 );
HYPRE_SYCL_LAUNCH( hypreSYCLKernel_ScatterRowPtr, gDim, bDim, nrows, d_row_ptr, d_row_ind );
HYPRE_GPU_LAUNCH( hypreSYCLKernel_ScatterRowPtr, gDim, bDim, nrows, d_row_ptr, d_row_ind );
HYPRE_ONEDPL_CALL( std::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind,
oneapi::dpl::maximum<HYPRE_Int>());
@ -234,7 +234,7 @@ void hypre_CudaCompileFlagCheck()
HYPRE_CUDA_CALL( cudaMalloc(&cuda_arch_compile_d, sizeof(hypre_int)) );
hypre_TMemcpy(cuda_arch_compile_d, &cuda_arch_compile, hypre_int, 1, HYPRE_MEMORY_DEVICE,
HYPRE_MEMORY_HOST);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d );
hypre_TMemcpy(&cuda_arch_compile, cuda_arch_compile_d, hypre_int, 1, HYPRE_MEMORY_HOST,
HYPRE_MEMORY_DEVICE);
//hypre_TFree(cuda_arch_compile_d, HYPRE_MEMORY_DEVICE);
@ -345,7 +345,7 @@ hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_di
return hypre_error_flag;
}
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia,
d_offd_ia, d_rownnz );
return hypre_error_flag;
@ -485,7 +485,7 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows,
}
*/
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim,
nrows, d_row_indices, has_offd, first_col, d_col_map_offd_A,
d_diag_i, d_diag_j, d_diag_a,
d_offd_i, d_offd_j, d_offd_a,
@ -696,7 +696,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea
/* trivial cases, n = 1, 2 */
dim3 bDim = 1;
dim3 gDim = 1;
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y );
}
else
{
@ -735,7 +735,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(reduced_n, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim,
reduced_n, x, reduced_map, reduced_y );
if (!work)
@ -778,7 +778,7 @@ hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v)
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v );
return hypre_error_flag;
}
@ -812,7 +812,7 @@ hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Comple
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y );
return hypre_error_flag;
}
@ -846,7 +846,7 @@ hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
return hypre_error_flag;
}
@ -885,7 +885,7 @@ hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y );
return hypre_error_flag;
}
@ -920,7 +920,7 @@ hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z );
return hypre_error_flag;
}
@ -944,7 +944,7 @@ hypreDevice_BigToSmallCopy(HYPRE_Int *tgt, const HYPRE_BigInt *src, HYPRE_Int si
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(size, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size);
HYPRE_GPU_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size);
return hypre_error_flag;
}

View File

@ -73,8 +73,6 @@
#include <oneapi/mkl/rng.hpp>
#endif
#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP)
#endif // defined(HYPRE_USING_CUDA)
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@ -122,7 +120,7 @@
func_name(oneapi::dpl::execution::make_device_policy( \
*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);
#define HYPRE_SYCL_LAUNCH(kernel_name, gridsize, blocksize, ...) \
#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) \
{ \
if ( gridsize[0] == 0 || blocksize[0] == 0 ) \
{ \
@ -439,7 +437,7 @@ using namespace thrust::placeholders;
#define GPU_LAUNCH_SYNC
#endif // defined(HYPRE_DEBUG)
#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \
#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \
{ \
if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \
blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \
@ -455,7 +453,7 @@ using namespace thrust::placeholders;
} \
}
#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__)
#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__)
/* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right
* The following one works OK for now */