Switch to HYPRE_GPU_LAUNCH
This commit is contained in:
parent
7883708a4f
commit
4136c63269
@ -164,7 +164,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix,
|
||||
/* mark unwanted elements as -1 */
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(len1, "thread", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator,
|
||||
(HYPRE_Int *) row_indexes, ncols, indicator );
|
||||
|
||||
auto new_end = HYPRE_THRUST_CALL(
|
||||
@ -233,7 +233,7 @@ hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big
|
||||
/*
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(N0, "thread", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 );
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 );
|
||||
*/
|
||||
|
||||
/* output X: 0: keep, 1: zero-out */
|
||||
|
||||
@ -251,7 +251,7 @@ hypre_IJVectorAssembleParDevice(hypre_IJVector *vector)
|
||||
/* set/add to local vector */
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(new_nnz, "thread", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i,
|
||||
vec_start, new_sora,
|
||||
hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) );
|
||||
|
||||
|
||||
@ -627,12 +627,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nnz, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
F2V_diag_nnz, 3, F2V_diag_J, Pi_diag_J );
|
||||
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
|
||||
F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data,
|
||||
Pi_diag_data );
|
||||
}
|
||||
@ -693,12 +693,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nnz, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
F2V_offd_nnz, 3, F2V_offd_J, Pi_offd_J );
|
||||
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
|
||||
F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data,
|
||||
Pi_offd_data );
|
||||
}
|
||||
@ -907,7 +907,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data,
|
||||
Pix_diag_data, Piy_diag_data, Piz_diag_data );
|
||||
}
|
||||
@ -987,7 +987,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data,
|
||||
Pix_offd_data, Piy_offd_data, Piz_offd_data );
|
||||
}
|
||||
|
||||
@ -496,7 +496,7 @@ HYPRE_Int hypre_AMESetup(void *esolver)
|
||||
{
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim,
|
||||
nv, GtdI, GtdJ, GtdA, GtoI, GtoJ, GtoA, edge_bc, offd_edge_bc );
|
||||
}
|
||||
else
|
||||
|
||||
@ -194,7 +194,7 @@ HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x,
|
||||
{
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
|
||||
size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
|
||||
}
|
||||
else
|
||||
@ -241,7 +241,7 @@ HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x,
|
||||
{
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
|
||||
size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
|
||||
}
|
||||
else
|
||||
@ -456,7 +456,7 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A)
|
||||
bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
|
||||
nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd);
|
||||
|
||||
//hypre_SyncComputeStream(hypre_handle());
|
||||
@ -787,7 +787,7 @@ HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d)
|
||||
{
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
|
||||
num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d);
|
||||
}
|
||||
else
|
||||
@ -1623,12 +1623,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
G_diag_nnz, dim, G_diag_J, Pi_diag_J );
|
||||
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
|
||||
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
|
||||
Pi_diag_data );
|
||||
}
|
||||
@ -1696,12 +1696,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
G_offd_nnz, dim, G_offd_J, Pi_offd_J );
|
||||
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
|
||||
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
|
||||
Pi_offd_data );
|
||||
}
|
||||
@ -1944,7 +1944,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
|
||||
Pix_diag_data, Piy_diag_data, Piz_diag_data );
|
||||
}
|
||||
@ -2010,7 +2010,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL,
|
||||
Pix_diag_data, Piy_diag_data, NULL );
|
||||
}
|
||||
@ -2068,7 +2068,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL,
|
||||
Pix_diag_data, NULL, NULL );
|
||||
}
|
||||
@ -2145,7 +2145,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
|
||||
Pix_offd_data, Piy_offd_data, Piz_offd_data );
|
||||
}
|
||||
@ -2227,7 +2227,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL,
|
||||
Pix_offd_data, Piy_offd_data, NULL );
|
||||
}
|
||||
@ -2299,7 +2299,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
|
||||
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL,
|
||||
Pix_offd_data, NULL, NULL );
|
||||
}
|
||||
@ -2501,12 +2501,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
G_diag_nnz, dim, G_diag_J, GPi_diag_J );
|
||||
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
|
||||
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
|
||||
GPi_diag_data );
|
||||
}
|
||||
@ -2575,12 +2575,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
|
||||
G_offd_nnz, dim, G_offd_J, GPi_offd_J );
|
||||
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
|
||||
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
|
||||
GPi_offd_data );
|
||||
}
|
||||
@ -2815,7 +2815,7 @@ HYPRE_Int hypre_AMSSetup(void *solver,
|
||||
{
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
|
||||
nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data );
|
||||
}
|
||||
else
|
||||
@ -3401,7 +3401,7 @@ HYPRE_Int hypre_AMSSetup(void *solver,
|
||||
{
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(Gt_num_rows, "warp", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
|
||||
Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data, Gt_offd_I, Gt_offd_data,
|
||||
Gx_data, Gy_data, Gz_data );
|
||||
}
|
||||
|
||||
@ -109,7 +109,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim);
|
||||
|
||||
/* only for rows corresponding to F2 (notice flag == -1) */
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
|
||||
gDim, bDim,
|
||||
A_nr_local,
|
||||
A_offd_nnz > 0,
|
||||
@ -160,7 +160,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A,
|
||||
* diagnoally scale As_F2F (from both sides) and replace the diagonal */
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF,
|
||||
gDim, bDim,
|
||||
AF2F_nr_local,
|
||||
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)),
|
||||
@ -329,7 +329,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A,
|
||||
dlam = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE);
|
||||
dtmp = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
|
||||
gDim, bDim,
|
||||
AFC_nr_local,
|
||||
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_FF)),
|
||||
@ -388,7 +388,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A,
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim);
|
||||
|
||||
/* only for rows corresponding to F2 (notice flag == -1) */
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
|
||||
gDim, bDim,
|
||||
A_nr_local,
|
||||
A_offd_nnz > 0,
|
||||
@ -438,7 +438,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A,
|
||||
* diagnoally scale As_F2F (from both sides) and replace the diagonal */
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF,
|
||||
gDim, bDim,
|
||||
AF2F_nr_local,
|
||||
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)),
|
||||
|
||||
@ -331,7 +331,7 @@ hypre_PMISCoarseningInitDevice( hypre_ParCSRMatrix *S, /* in */
|
||||
HYPRE_Int *new_end;
|
||||
|
||||
/* init CF_marker_diag and measure_diag: remove some special nodes */
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim,
|
||||
num_rows_diag, CF_init, S_diag_i, S_offd_i, measure_diag, CF_marker_diag );
|
||||
|
||||
/* communicate for measure_offd */
|
||||
@ -494,7 +494,7 @@ hypre_PMISCoarseningUpdateCFDevice( hypre_ParCSRMatrix *S, /* in
|
||||
bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF,
|
||||
gDim, bDim,
|
||||
graph_diag_size,
|
||||
graph_diag,
|
||||
|
||||
@ -424,7 +424,7 @@ HYPRE_Int hypre_dgemv_device(HYPRE_Int m, HYPRE_Int n, HYPRE_Int lda, HYPRE_Real
|
||||
dim3 bDim(BLOCK_SIZE, 1, 1);
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(m, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y );
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
@ -170,7 +170,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix *S,
|
||||
bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim,
|
||||
graph_diag_size, graph_diag, measure_diag, measure_offd,
|
||||
S_diag_i, S_diag_j, S_offd_i, S_offd_j,
|
||||
IS_marker_diag, IS_marker_offd, IS_offd_temp_mark );
|
||||
@ -186,7 +186,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix *S,
|
||||
/* adjust IS_marker_diag from the received */
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(num_elmts_send, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim,
|
||||
IS_marker_diag, num_elmts_send, send_map_elmts,
|
||||
int_send_buf, IS_offd_temp_mark );
|
||||
|
||||
|
||||
@ -197,7 +197,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim,
|
||||
n_fine, S_diag_i, S_diag_j, S_offd_i, S_offd_j,
|
||||
CF_marker, CF_marker_offd, num_functions,
|
||||
dof_func_dev, dof_func_offd, P_diag_i, P_offd_i);
|
||||
@ -228,7 +228,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A,
|
||||
|
||||
if (interp_type == 3)
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim,
|
||||
n_fine, A_diag_i, A_diag_j, A_diag_data,
|
||||
A_offd_i, A_offd_j, A_offd_data,
|
||||
hypre_ParCSRMatrixSocDiagJ(S),
|
||||
@ -241,7 +241,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A,
|
||||
}
|
||||
else
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim,
|
||||
n_fine, A_diag_i, A_diag_j, A_diag_data,
|
||||
A_offd_i, A_offd_j, A_offd_data,
|
||||
hypre_ParCSRMatrixSocDiagJ(S),
|
||||
@ -1161,7 +1161,7 @@ hypre_BoomerAMGBuildInterpOnePntDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim,
|
||||
n_fine, A_diag_i, A_strong_diag_j, A_diag_a, A_offd_i, A_strong_offd_j,
|
||||
A_offd_a, CF_marker, CF_marker_offd, diag_compress_marker,
|
||||
offd_compress_marker, P_diag_i, P_diag_j_temp, P_offd_i, P_offd_j_temp);
|
||||
|
||||
@ -162,7 +162,7 @@ hypre_BoomerAMGInterpTruncationDevice( hypre_ParCSRMatrix *P, HYPRE_Real trunc_f
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim,
|
||||
nrows, trunc_factor, max_elmts, P_rowptr, P_j, P_a );
|
||||
|
||||
/* build new P_diag and P_offd */
|
||||
|
||||
@ -87,7 +87,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
|
||||
gDim, bDim,
|
||||
A_nr_of_rows,
|
||||
A_offd_nnz > 0,
|
||||
@ -128,7 +128,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A,
|
||||
/* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */
|
||||
hypre_GpuProfilingPushRange("Compute interp matrix");
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc,
|
||||
gDim, bDim,
|
||||
W_nr_of_rows,
|
||||
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
|
||||
@ -273,7 +273,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
|
||||
gDim, bDim,
|
||||
A_nr_of_rows,
|
||||
A_offd_nnz > 0,
|
||||
@ -352,7 +352,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A,
|
||||
|
||||
hypre_GpuProfilingPushRange("Compute interp matrix");
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_twiaff_w,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_twiaff_w,
|
||||
gDim, bDim,
|
||||
W_nr_of_rows,
|
||||
hypre_ParCSRMatrixFirstRowIndex(AFF),
|
||||
@ -502,7 +502,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
|
||||
gDim, bDim,
|
||||
A_nr_of_rows,
|
||||
A_offd_nnz > 0,
|
||||
@ -545,7 +545,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A,
|
||||
dtmp = hypre_TAlloc(HYPRE_Complex, W_nr_of_rows, HYPRE_MEMORY_DEVICE);
|
||||
hypre_GpuProfilingPushRange("Compute D_tmp");
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
|
||||
gDim, bDim,
|
||||
W_nr_of_rows,
|
||||
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
|
||||
@ -587,7 +587,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A,
|
||||
/* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */
|
||||
hypre_GpuProfilingPushRange("Compute interp matrix");
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe,
|
||||
gDim, bDim,
|
||||
W_nr_of_rows,
|
||||
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
|
||||
|
||||
@ -254,7 +254,7 @@ hypre_BoomerAMGBuildRestrNeumannAIRDevice( hypre_ParCSRMatrix *A,
|
||||
/* assemble the diagonal part of R from Z */
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim,
|
||||
n_cpts, Fmap, Cmap, Z_diag_i, Z_diag_j, Z_diag_a, R_diag_i, R_diag_j, R_diag_a);
|
||||
|
||||
num_cols_offd_R = num_cols_offd_Z;
|
||||
|
||||
@ -343,7 +343,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(remaining, "warp", bDim);
|
||||
|
||||
/* output diag_shifts is 0/1 indicating if points_left_dev[i] is picked in this pass */
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_pass_order_count,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_pass_order_count,
|
||||
gDim, bDim,
|
||||
remaining,
|
||||
current_pass,
|
||||
@ -438,7 +438,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim,
|
||||
n_fine, A_diag_i, A_diag_j, A_diag_data,
|
||||
A_offd_i, A_offd_j, A_offd_data,
|
||||
CF_marker,
|
||||
@ -591,7 +591,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim,
|
||||
pass_starts[p + 1], pass_starts[p + 2], pass_order,
|
||||
Pi_diag_i, Pi_diag_j, Pi_diag_data,
|
||||
P_diag_i, P_diag_j, P_diag_data,
|
||||
@ -654,7 +654,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(npoints, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim,
|
||||
pass_starts[p + 1],
|
||||
pass_starts[p + 2],
|
||||
pass_order,
|
||||
@ -893,7 +893,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
|
||||
num_points, color, pass_order, pass_marker, pass_marker_offd,
|
||||
S_diag_i, S_diag_j, S_offd_i, S_offd_j,
|
||||
P_diag_i, P_offd_i );
|
||||
@ -921,7 +921,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim,
|
||||
num_points,
|
||||
color,
|
||||
pass_order,
|
||||
@ -1144,7 +1144,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
|
||||
num_points, color, pass_order, pass_marker, pass_marker_offd,
|
||||
S_diag_i, S_diag_j, S_offd_i, S_offd_j,
|
||||
Q_diag_i, Q_offd_i );
|
||||
@ -1173,7 +1173,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim,
|
||||
num_points,
|
||||
color,
|
||||
pass_order,
|
||||
@ -1244,7 +1244,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim,
|
||||
num_points, pass_order, A_diag_i, A_diag_data,
|
||||
Pi_diag_i, Pi_diag_data, Pi_offd_i, Pi_offd_data,
|
||||
w_row_sum );
|
||||
|
||||
@ -155,7 +155,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A,
|
||||
|
||||
bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(A_num_rows, "warp", bDim);
|
||||
HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate,
|
||||
HYPRE_GPU_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate,
|
||||
gDim,
|
||||
bDim,
|
||||
A_num_rows,
|
||||
|
||||
@ -140,7 +140,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A,
|
||||
|
||||
if (abs_soc)
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim,
|
||||
num_variables, max_row_sum, strength_threshold,
|
||||
A_diag_data, A_diag_i, A_diag_j,
|
||||
A_offd_data, A_offd_i, A_offd_j,
|
||||
@ -150,7 +150,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A,
|
||||
}
|
||||
else
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim,
|
||||
num_variables, max_row_sum, strength_threshold,
|
||||
A_diag_data, A_diag_i, A_diag_j,
|
||||
A_offd_data, A_offd_i, A_offd_j,
|
||||
|
||||
@ -628,7 +628,7 @@ hypre_ConcatDiagAndOffdDevice(hypre_ParCSRMatrix *A)
|
||||
const dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
const dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
|
||||
gDim, bDim,
|
||||
hypre_CSRMatrixNumRows(A_diag),
|
||||
hypre_CSRMatrixNumCols(A_diag),
|
||||
@ -745,7 +745,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_ParCSRMatrixNumRows(A), "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
|
||||
gDim, bDim,
|
||||
hypre_CSRMatrixNumRows(A_diag),
|
||||
hypre_CSRMatrixNumCols(A_diag),
|
||||
@ -777,7 +777,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A,
|
||||
|
||||
hypre_assert(hypre_CSRMatrixNumCols(E_diag) == hypre_CSRMatrixNumCols(A_diag));
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
|
||||
gDim, bDim,
|
||||
hypre_CSRMatrixNumRows(E_diag),
|
||||
hypre_CSRMatrixNumCols(E_diag),
|
||||
@ -1211,21 +1211,21 @@ hypre_ParCSRMatrixDropSmallEntriesDevice( hypre_ParCSRMatrix *A,
|
||||
|
||||
if (type == -1)
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols < -1 >, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols < -1 >, gDim, bDim,
|
||||
hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
|
||||
hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
|
||||
hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
|
||||
}
|
||||
if (type == 1)
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim,
|
||||
hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
|
||||
hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
|
||||
hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
|
||||
}
|
||||
if (type == 2)
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim,
|
||||
hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
|
||||
hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
|
||||
hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
|
||||
|
||||
@ -714,7 +714,7 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A )
|
||||
bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim,
|
||||
nrows, A_i, A_j, A_data);
|
||||
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
@ -751,7 +751,7 @@ hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A )
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim);
|
||||
|
||||
HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRCheckDiagFirst, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRCheckDiagFirst, gDim, bDim,
|
||||
hypre_CSRMatrixNumRows(A),
|
||||
hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result );
|
||||
|
||||
@ -845,7 +845,7 @@ hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A,
|
||||
HYPRE_Int *result = NULL;
|
||||
#endif
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim,
|
||||
v, hypre_CSRMatrixNumRows(A),
|
||||
hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A),
|
||||
tol, result );
|
||||
@ -940,7 +940,7 @@ hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A,
|
||||
HYPRE_Int *result = NULL;
|
||||
#endif
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixReplaceDiagDevice, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRMatrixReplaceDiagDevice, gDim, bDim,
|
||||
new_diag, v, hypre_CSRMatrixNumRows(A),
|
||||
hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A),
|
||||
tol, result );
|
||||
@ -1135,17 +1135,17 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A,
|
||||
|
||||
if (type == 0)
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
|
||||
row_sum, scal, set_or_add[0] == 's' );
|
||||
}
|
||||
else if (type == 1)
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
|
||||
row_sum, scal, set_or_add[0] == 's' );
|
||||
}
|
||||
else if (type == 2)
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j,
|
||||
row_sum, scal, set_or_add[0] == 's' );
|
||||
}
|
||||
|
||||
@ -1240,7 +1240,7 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A,
|
||||
bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type );
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type );
|
||||
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
}
|
||||
@ -1560,7 +1560,7 @@ hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixIntersectPattern, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CSRMatrixIntersectPattern, gDim, bDim,
|
||||
nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt );
|
||||
|
||||
hypre_TFree(Cii, HYPRE_MEMORY_DEVICE);
|
||||
@ -1640,7 +1640,7 @@ hypre_CSRMatrixDiagScaleDevice( hypre_CSRMatrix *A,
|
||||
bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRDiagScale, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH(hypreCUDAKernel_CSRDiagScale, gDim, bDim,
|
||||
nrows, A_i, A_j, A_data, ldata, rdata);
|
||||
|
||||
hypre_SyncComputeStream(hypre_handle());
|
||||
|
||||
@ -506,7 +506,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m,
|
||||
// for cases where one WARP works on a row
|
||||
dim3 gDim( (m + bDim.z - 1) / bDim.z );
|
||||
|
||||
HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt<num_warps_per_block, shmem_hash_size, 1, hash_type>),
|
||||
HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt<num_warps_per_block, shmem_hash_size, 1, hash_type>),
|
||||
gDim, bDim, /* shmem_size, */
|
||||
m, NULL, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash1_i, d_ghash1_j, d_ghash1_a,
|
||||
d_rc, d_rf );
|
||||
@ -542,7 +542,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m,
|
||||
// for cases where one WARP works on a row
|
||||
dim3 gDim( (num_failed_rows + bDim.z - 1) / bDim.z );
|
||||
|
||||
HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt<num_warps_per_block, shmem_hash_size, 2, hash_type>),
|
||||
HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt<num_warps_per_block, shmem_hash_size, 2, hash_type>),
|
||||
gDim, bDim, /* shmem_size, */
|
||||
num_failed_rows, rf_ind, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash2_i, d_ghash2_j,
|
||||
d_ghash2_a,
|
||||
@ -563,7 +563,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m,
|
||||
// for cases where one WARP works on a row
|
||||
dim3 gDim( (m + bDim.z - 1) / bDim.z );
|
||||
|
||||
HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_hash_into_C<num_warps_per_block, shmem_hash_size>), gDim,
|
||||
HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_hash_into_C<num_warps_per_block, shmem_hash_size>), gDim,
|
||||
bDim,
|
||||
m, d_rf,
|
||||
d_js, d_as,
|
||||
|
||||
@ -467,7 +467,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m,
|
||||
|
||||
hypre_create_ija(m, d_rc, d_ic, &d_jc, &d_c, &nnzC_nume);
|
||||
|
||||
HYPRE_CUDA_LAUNCH ( (hypre_spgemm_numeric < num_warps_per_block, shmem_hash_size, !exact_rownnz,
|
||||
HYPRE_GPU_LAUNCH ( (hypre_spgemm_numeric < num_warps_per_block, shmem_hash_size, !exact_rownnz,
|
||||
hash_type > ),
|
||||
gDim, bDim, /* shmem_size, */
|
||||
m, /* k, n, */ d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_ic, d_jc, d_c, d_rc,
|
||||
@ -493,7 +493,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m,
|
||||
|
||||
/* copy to the final C */
|
||||
dim3 gDim( (m + bDim.z - 1) / bDim.z );
|
||||
HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C<num_warps_per_block>), gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C<num_warps_per_block>), gDim, bDim,
|
||||
m, d_ic, d_jc, d_c, d_ic_new, d_jc_new, d_c_new );
|
||||
|
||||
hypre_TFree(d_ic, HYPRE_MEMORY_DEVICE);
|
||||
|
||||
@ -313,19 +313,19 @@ hypre_spgemm_rownnz_attempt(HYPRE_Int m,
|
||||
* ---------------------------------------------------------------------------*/
|
||||
if (hash_type == 'L')
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'L'>),
|
||||
HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'L'>),
|
||||
gDim, bDim,
|
||||
m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf );
|
||||
}
|
||||
else if (hash_type == 'Q')
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'Q'>),
|
||||
HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'Q'>),
|
||||
gDim, bDim,
|
||||
m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf );
|
||||
}
|
||||
else if (hash_type == 'D')
|
||||
{
|
||||
HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'D'>),
|
||||
HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic<num_warps_per_block, shmem_hash_size, ATTEMPT, 'D'>),
|
||||
gDim, bDim,
|
||||
m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf );
|
||||
}
|
||||
|
||||
@ -287,11 +287,11 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i
|
||||
|
||||
dim3 gDim( (nsamples * N + bDim.z * HYPRE_WARP_SIZE - 1) / (bDim.z * HYPRE_WARP_SIZE) );
|
||||
|
||||
HYPRE_CUDA_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 );
|
||||
HYPRE_GPU_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 );
|
||||
|
||||
/* step-1: layer 3-2 */
|
||||
gDim.x = (K + bDim.z - 1) / bDim.z;
|
||||
HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel<T, NUM_WARPS_PER_BLOCK, SHMEM_SIZE_PER_WARP, 2>), gDim,
|
||||
HYPRE_GPU_LAUNCH( (cohen_rowest_kernel<T, NUM_WARPS_PER_BLOCK, SHMEM_SIZE_PER_WARP, 2>), gDim,
|
||||
bDim,
|
||||
K, d_ib, d_jb, d_V1, d_V2, NULL, nsamples, NULL, NULL, -1.0);
|
||||
|
||||
@ -301,7 +301,7 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i
|
||||
d_V3 = (T*) d_rc;
|
||||
|
||||
gDim.x = (M + bDim.z - 1) / bDim.z;
|
||||
HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel<T, NUM_WARPS_PER_BLOCK, SHMEM_SIZE_PER_WARP, 1>), gDim,
|
||||
HYPRE_GPU_LAUNCH( (cohen_rowest_kernel<T, NUM_WARPS_PER_BLOCK, SHMEM_SIZE_PER_WARP, 1>), gDim,
|
||||
bDim,
|
||||
M, d_ia, d_ja, d_V2, d_V3, d_rc, nsamples, d_low, d_upp, mult_factor);
|
||||
|
||||
@ -336,13 +336,13 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n,
|
||||
if (row_est_mtd == 1)
|
||||
{
|
||||
/* naive overestimate */
|
||||
HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim,
|
||||
m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, NULL, d_rc );
|
||||
}
|
||||
else if (row_est_mtd == 2)
|
||||
{
|
||||
/* naive underestimate */
|
||||
HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim,
|
||||
m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_rc, NULL );
|
||||
}
|
||||
else if (row_est_mtd == 3)
|
||||
@ -361,7 +361,7 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n,
|
||||
HYPRE_Int *d_low = d_low_upp;
|
||||
HYPRE_Int *d_upp = d_low_upp + m;
|
||||
|
||||
HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim,
|
||||
m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_low, d_upp );
|
||||
|
||||
/* Cohen's algorithm, stochastic approach */
|
||||
|
||||
@ -103,14 +103,14 @@ hypre_SpGemmCreateGlobalHashTable( HYPRE_Int num_rows, /* number of
|
||||
{
|
||||
ghash_i = hypre_TAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE);
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_ghash, "thread", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim,
|
||||
num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE );
|
||||
}
|
||||
else if (type == 2)
|
||||
{
|
||||
ghash_i = hypre_CTAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE);
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
|
||||
HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim,
|
||||
num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE );
|
||||
}
|
||||
|
||||
|
||||
@ -170,7 +170,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows,
|
||||
const HYPRE_Int group_size = 32;
|
||||
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size;
|
||||
const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block);
|
||||
HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
|
||||
nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y );
|
||||
}
|
||||
else if (rownnz >= 32)
|
||||
@ -178,7 +178,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows,
|
||||
const HYPRE_Int group_size = 16;
|
||||
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size;
|
||||
const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block);
|
||||
HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
|
||||
nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y );
|
||||
}
|
||||
else if (rownnz >= 16)
|
||||
@ -186,7 +186,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows,
|
||||
const HYPRE_Int group_size = 8;
|
||||
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size;
|
||||
const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block);
|
||||
HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
|
||||
nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y );
|
||||
}
|
||||
else if (rownnz >= 8)
|
||||
@ -194,7 +194,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows,
|
||||
const HYPRE_Int group_size = 4;
|
||||
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size;
|
||||
const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block);
|
||||
HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
|
||||
nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y );
|
||||
}
|
||||
else
|
||||
@ -202,7 +202,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows,
|
||||
const HYPRE_Int group_size = 4;
|
||||
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size;
|
||||
const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block);
|
||||
HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle<F, group_size, HYPRE_Real>), gDim, bDim,
|
||||
nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y );
|
||||
}
|
||||
|
||||
|
||||
@ -810,7 +810,7 @@ extern "C++"
|
||||
const dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length );
|
||||
HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length );
|
||||
}
|
||||
}
|
||||
|
||||
@ -871,7 +871,7 @@ extern "C++"
|
||||
hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x);
|
||||
*/
|
||||
|
||||
HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body );
|
||||
HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -76,7 +76,7 @@ extern "C++"
|
||||
const dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length );
|
||||
HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length );
|
||||
}
|
||||
}
|
||||
|
||||
@ -137,7 +137,7 @@ extern "C++"
|
||||
hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x);
|
||||
*/
|
||||
|
||||
HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body );
|
||||
HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -130,8 +130,6 @@ struct hypre_device_allocator
|
||||
#include <oneapi/mkl/rng.hpp>
|
||||
#endif
|
||||
|
||||
#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
|
||||
#endif // defined(HYPRE_USING_CUDA)
|
||||
|
||||
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
@ -179,7 +177,7 @@ struct hypre_device_allocator
|
||||
func_name(oneapi::dpl::execution::make_device_policy( \
|
||||
*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);
|
||||
|
||||
#define HYPRE_SYCL_LAUNCH(kernel_name, gridsize, blocksize, ...) \
|
||||
#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) \
|
||||
{ \
|
||||
if ( gridsize[0] == 0 || blocksize[0] == 0 ) \
|
||||
{ \
|
||||
@ -496,7 +494,7 @@ using namespace thrust::placeholders;
|
||||
#define GPU_LAUNCH_SYNC
|
||||
#endif // defined(HYPRE_DEBUG)
|
||||
|
||||
#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \
|
||||
#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \
|
||||
{ \
|
||||
if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \
|
||||
blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \
|
||||
@ -512,7 +510,7 @@ using namespace thrust::placeholders;
|
||||
} \
|
||||
}
|
||||
|
||||
#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__)
|
||||
#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__)
|
||||
|
||||
/* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right
|
||||
* The following one works OK for now */
|
||||
@ -1375,7 +1373,7 @@ struct ReduceSum
|
||||
/* 2nd reduction with only *one* block */
|
||||
hypre_assert(nblocks >= 0 && nblocks <= 1024);
|
||||
const dim3 gDim(1), bDim(1024);
|
||||
HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
|
||||
HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
|
||||
hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
|
||||
val += init;
|
||||
}
|
||||
|
||||
@ -267,7 +267,7 @@ struct ReduceSum
|
||||
/* 2nd reduction with only *one* block */
|
||||
hypre_assert(nblocks >= 0 && nblocks <= 1024);
|
||||
const dim3 gDim(1), bDim(1024);
|
||||
HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
|
||||
HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
|
||||
hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
|
||||
val += init;
|
||||
}
|
||||
|
||||
@ -68,7 +68,7 @@ hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Comple
|
||||
sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
|
||||
|
||||
HYPRE_SYCL_LAUNCH( hypreSYCLKernel_IVAXPY, gDim, bDim, n, a, x, y );
|
||||
HYPRE_GPU_LAUNCH( hypreSYCLKernel_IVAXPY, gDim, bDim, n, a, x, y );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -103,7 +103,7 @@ hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_
|
||||
sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
|
||||
|
||||
HYPRE_SYCL_LAUNCH( hypreSYCLKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
|
||||
HYPRE_GPU_LAUNCH( hypreSYCLKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -156,7 +156,7 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_
|
||||
|
||||
HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 );
|
||||
|
||||
HYPRE_SYCL_LAUNCH( hypreSYCLKernel_ScatterRowPtr, gDim, bDim, nrows, d_row_ptr, d_row_ind );
|
||||
HYPRE_GPU_LAUNCH( hypreSYCLKernel_ScatterRowPtr, gDim, bDim, nrows, d_row_ptr, d_row_ind );
|
||||
|
||||
HYPRE_ONEDPL_CALL( std::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind,
|
||||
oneapi::dpl::maximum<HYPRE_Int>());
|
||||
@ -234,7 +234,7 @@ void hypre_CudaCompileFlagCheck()
|
||||
HYPRE_CUDA_CALL( cudaMalloc(&cuda_arch_compile_d, sizeof(hypre_int)) );
|
||||
hypre_TMemcpy(cuda_arch_compile_d, &cuda_arch_compile, hypre_int, 1, HYPRE_MEMORY_DEVICE,
|
||||
HYPRE_MEMORY_HOST);
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d );
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d );
|
||||
hypre_TMemcpy(&cuda_arch_compile, cuda_arch_compile_d, hypre_int, 1, HYPRE_MEMORY_HOST,
|
||||
HYPRE_MEMORY_DEVICE);
|
||||
//hypre_TFree(cuda_arch_compile_d, HYPRE_MEMORY_DEVICE);
|
||||
@ -345,7 +345,7 @@ hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_di
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia,
|
||||
d_offd_ia, d_rownnz );
|
||||
|
||||
return hypre_error_flag;
|
||||
@ -485,7 +485,7 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows,
|
||||
}
|
||||
*/
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim,
|
||||
nrows, d_row_indices, has_offd, first_col, d_col_map_offd_A,
|
||||
d_diag_i, d_diag_j, d_diag_a,
|
||||
d_offd_i, d_offd_j, d_offd_a,
|
||||
@ -696,7 +696,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea
|
||||
/* trivial cases, n = 1, 2 */
|
||||
dim3 bDim = 1;
|
||||
dim3 gDim = 1;
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y );
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y );
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -735,7 +735,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(reduced_n, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim,
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim,
|
||||
reduced_n, x, reduced_map, reduced_y );
|
||||
|
||||
if (!work)
|
||||
@ -778,7 +778,7 @@ hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v)
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v );
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -812,7 +812,7 @@ hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Comple
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y );
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -846,7 +846,7 @@ hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -885,7 +885,7 @@ hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y );
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -920,7 +920,7 @@ hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data,
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z );
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z );
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
@ -944,7 +944,7 @@ hypreDevice_BigToSmallCopy(HYPRE_Int *tgt, const HYPRE_BigInt *src, HYPRE_Int si
|
||||
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
|
||||
dim3 gDim = hypre_GetDefaultDeviceGridDimension(size, "thread", bDim);
|
||||
|
||||
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size);
|
||||
HYPRE_GPU_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size);
|
||||
|
||||
return hypre_error_flag;
|
||||
}
|
||||
|
||||
@ -73,8 +73,6 @@
|
||||
#include <oneapi/mkl/rng.hpp>
|
||||
#endif
|
||||
|
||||
#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP)
|
||||
|
||||
#endif // defined(HYPRE_USING_CUDA)
|
||||
|
||||
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
@ -122,7 +120,7 @@
|
||||
func_name(oneapi::dpl::execution::make_device_policy( \
|
||||
*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);
|
||||
|
||||
#define HYPRE_SYCL_LAUNCH(kernel_name, gridsize, blocksize, ...) \
|
||||
#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) \
|
||||
{ \
|
||||
if ( gridsize[0] == 0 || blocksize[0] == 0 ) \
|
||||
{ \
|
||||
@ -439,7 +437,7 @@ using namespace thrust::placeholders;
|
||||
#define GPU_LAUNCH_SYNC
|
||||
#endif // defined(HYPRE_DEBUG)
|
||||
|
||||
#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \
|
||||
#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \
|
||||
{ \
|
||||
if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \
|
||||
blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \
|
||||
@ -455,7 +453,7 @@ using namespace thrust::placeholders;
|
||||
} \
|
||||
}
|
||||
|
||||
#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__)
|
||||
#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__)
|
||||
|
||||
/* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right
|
||||
* The following one works OK for now */
|
||||
|
||||
Loading…
Reference in New Issue
Block a user