Merge branch 'interp_trunc' of github.com:hypre-space/hypre into interp_trunc
This commit is contained in:
commit
750d4877a4
@ -43,10 +43,10 @@ hypreCUDAKernel_InterpTruncationPass0_v1( hypre_DeviceItem &item,
|
||||
p_diag = read_only_load(P_diag_i + row + lane);
|
||||
p_offd = read_only_load(P_offd_i + row + lane);
|
||||
}
|
||||
q_diag = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag, 1);
|
||||
p_diag = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag, 0);
|
||||
q_offd = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd, 1);
|
||||
p_offd = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd, 0);
|
||||
q_diag = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, p_diag, 1);
|
||||
p_diag = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, p_diag, 0);
|
||||
q_offd = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, p_offd, 1);
|
||||
p_offd = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, p_offd, 0);
|
||||
|
||||
/* 1. compute row rowsum, rowmax */
|
||||
for (HYPRE_Int i = p_diag + lane; i < q_diag; i += HYPRE_WARP_SIZE)
|
||||
@ -69,7 +69,7 @@ hypreCUDAKernel_InterpTruncationPass0_v1( hypre_DeviceItem &item,
|
||||
HYPRE_Int cnt_diag = 0, cnt_offd = 0;
|
||||
|
||||
/* 2. move wanted entries to the front and row scal */
|
||||
for (HYPRE_Int i = p_diag + lane; __any_sync(HYPRE_WARP_FULL_MASK, i < q_diag);
|
||||
for (HYPRE_Int i = p_diag + lane; warp_any_sync(item, HYPRE_WARP_FULL_MASK, i < q_diag);
|
||||
i += HYPRE_WARP_SIZE)
|
||||
{
|
||||
HYPRE_Real v = 0.0;
|
||||
@ -98,7 +98,7 @@ hypreCUDAKernel_InterpTruncationPass0_v1( hypre_DeviceItem &item,
|
||||
cnt_diag += sum;
|
||||
}
|
||||
|
||||
for (HYPRE_Int i = p_offd + lane; __any_sync(HYPRE_WARP_FULL_MASK, i < q_offd);
|
||||
for (HYPRE_Int i = p_offd + lane; warp_any_sync(item, HYPRE_WARP_FULL_MASK, i < q_offd);
|
||||
i += HYPRE_WARP_SIZE)
|
||||
{
|
||||
HYPRE_Real v = 0.0;
|
||||
@ -365,9 +365,9 @@ hypreCUDAKernel_InterpTruncationPass2_v1( hypre_DeviceItem &item,
|
||||
p = read_only_load(P_diag_i + i + lane);
|
||||
pnew = read_only_load(P_diag_i_new + i + lane);
|
||||
}
|
||||
p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
|
||||
qnew = __shfl_sync(HYPRE_WARP_FULL_MASK, pnew, 1);
|
||||
pnew = __shfl_sync(HYPRE_WARP_FULL_MASK, pnew, 0);
|
||||
p = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, p, 0);
|
||||
qnew = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, pnew, 1);
|
||||
pnew = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, pnew, 0);
|
||||
|
||||
shift = p - pnew;
|
||||
for (HYPRE_Int k = pnew + lane; k < qnew; k += HYPRE_WARP_SIZE)
|
||||
@ -381,9 +381,9 @@ hypreCUDAKernel_InterpTruncationPass2_v1( hypre_DeviceItem &item,
|
||||
p = read_only_load(P_offd_i + i + lane);
|
||||
pnew = read_only_load(P_offd_i_new + i + lane);
|
||||
}
|
||||
p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
|
||||
qnew = __shfl_sync(HYPRE_WARP_FULL_MASK, pnew, 1);
|
||||
pnew = __shfl_sync(HYPRE_WARP_FULL_MASK, pnew, 0);
|
||||
p = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, p, 0);
|
||||
qnew = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, pnew, 1);
|
||||
pnew = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, pnew, 0);
|
||||
|
||||
shift = p - pnew;
|
||||
for (HYPRE_Int k = pnew + lane; k < qnew; k += HYPRE_WARP_SIZE)
|
||||
|
||||
@ -1,81 +1,81 @@
|
||||
# Output file: benchmark_ij.out.1
|
||||
PCG Setup wall clock time = 0.512658 seconds
|
||||
PCG Solve wall clock time = 0.552540 seconds
|
||||
PCG Setup wall clock time = 0.359271 seconds
|
||||
PCG Solve wall clock time = 0.504610 seconds
|
||||
# Output file: benchmark_ij.out.2
|
||||
PCG Setup wall clock time = 0.333322 seconds
|
||||
PCG Solve wall clock time = 0.450435 seconds
|
||||
PCG Setup wall clock time = 0.312138 seconds
|
||||
PCG Solve wall clock time = 0.416958 seconds
|
||||
# Output file: benchmark_ij.out.3
|
||||
PCG Setup wall clock time = 0.471184 seconds
|
||||
PCG Solve wall clock time = 1.044233 seconds
|
||||
PCG Setup wall clock time = 0.448648 seconds
|
||||
PCG Solve wall clock time = 1.005090 seconds
|
||||
# Output file: benchmark_ij.out.4
|
||||
PCG Setup wall clock time = 0.504893 seconds
|
||||
PCG Solve wall clock time = 0.255745 seconds
|
||||
PCG Setup wall clock time = 0.477832 seconds
|
||||
PCG Solve wall clock time = 0.215731 seconds
|
||||
# Output file: benchmark_ij.out.5
|
||||
PCG Setup wall clock time = 0.480607 seconds
|
||||
PCG Solve wall clock time = 0.179937 seconds
|
||||
PCG Setup wall clock time = 0.433146 seconds
|
||||
PCG Solve wall clock time = 0.182600 seconds
|
||||
# Output file: benchmark_ij.out.6
|
||||
PCG Setup wall clock time = 0.459764 seconds
|
||||
PCG Solve wall clock time = 0.937065 seconds
|
||||
PCG Setup wall clock time = 0.419559 seconds
|
||||
PCG Solve wall clock time = 0.929381 seconds
|
||||
# Output file: benchmark_ij.out.7
|
||||
PCG Setup wall clock time = 0.589380 seconds
|
||||
PCG Solve wall clock time = 0.227287 seconds
|
||||
PCG Setup wall clock time = 0.498759 seconds
|
||||
PCG Solve wall clock time = 0.217208 seconds
|
||||
# Output file: benchmark_ij.out.8
|
||||
PCG Setup wall clock time = 0.371488 seconds
|
||||
PCG Solve wall clock time = 0.539031 seconds
|
||||
PCG Setup wall clock time = 0.336192 seconds
|
||||
PCG Solve wall clock time = 0.492642 seconds
|
||||
# Output file: benchmark_ij.out.9
|
||||
PCG Setup wall clock time = 0.314890 seconds
|
||||
PCG Solve wall clock time = 0.434536 seconds
|
||||
PCG Setup wall clock time = 0.317082 seconds
|
||||
PCG Solve wall clock time = 0.408582 seconds
|
||||
# Output file: benchmark_ij.out.10
|
||||
PCG Setup wall clock time = 0.469411 seconds
|
||||
PCG Solve wall clock time = 1.003551 seconds
|
||||
PCG Setup wall clock time = 0.415598 seconds
|
||||
PCG Solve wall clock time = 0.956484 seconds
|
||||
# Output file: benchmark_ij.out.11
|
||||
PCG Setup wall clock time = 0.500090 seconds
|
||||
PCG Solve wall clock time = 0.241589 seconds
|
||||
PCG Setup wall clock time = 0.464038 seconds
|
||||
PCG Solve wall clock time = 0.228662 seconds
|
||||
# Output file: benchmark_ij.out.12
|
||||
PCG Setup wall clock time = 0.462413 seconds
|
||||
PCG Solve wall clock time = 0.184819 seconds
|
||||
PCG Setup wall clock time = 0.400749 seconds
|
||||
PCG Solve wall clock time = 0.179209 seconds
|
||||
# Output file: benchmark_ij.out.13
|
||||
PCG Setup wall clock time = 0.439033 seconds
|
||||
PCG Solve wall clock time = 0.940684 seconds
|
||||
PCG Setup wall clock time = 0.459325 seconds
|
||||
PCG Solve wall clock time = 0.950444 seconds
|
||||
# Output file: benchmark_ij.out.14
|
||||
PCG Setup wall clock time = 1.236288 seconds
|
||||
PCG Solve wall clock time = 0.873573 seconds
|
||||
PCG Setup wall clock time = 0.969487 seconds
|
||||
PCG Solve wall clock time = 0.908802 seconds
|
||||
# Output file: benchmark_ij.out.15
|
||||
PCG Setup wall clock time = 1.140977 seconds
|
||||
PCG Solve wall clock time = 0.990117 seconds
|
||||
PCG Setup wall clock time = 0.883830 seconds
|
||||
PCG Solve wall clock time = 0.983369 seconds
|
||||
# Output file: benchmark_ij.out.16
|
||||
PCG Setup wall clock time = 1.167420 seconds
|
||||
PCG Solve wall clock time = 0.876063 seconds
|
||||
PCG Setup wall clock time = 0.901717 seconds
|
||||
PCG Solve wall clock time = 0.867845 seconds
|
||||
# Output file: benchmark_ij.out.17
|
||||
PCG Setup wall clock time = 0.467263 seconds
|
||||
PCG Solve wall clock time = 0.192955 seconds
|
||||
PCG Setup wall clock time = 0.466032 seconds
|
||||
PCG Solve wall clock time = 0.188925 seconds
|
||||
# Output file: benchmark_ij.out.18
|
||||
PCG Setup wall clock time = 0.584768 seconds
|
||||
PCG Solve wall clock time = 0.181109 seconds
|
||||
PCG Setup wall clock time = 0.514625 seconds
|
||||
PCG Solve wall clock time = 0.164335 seconds
|
||||
# Output file: benchmark_ij.out.19
|
||||
PCG Setup wall clock time = 0.529740 seconds
|
||||
PCG Solve wall clock time = 0.473137 seconds
|
||||
PCG Setup wall clock time = 0.510175 seconds
|
||||
PCG Solve wall clock time = 0.466027 seconds
|
||||
# Output file: benchmark_ij.out.20
|
||||
PCG Setup wall clock time = 0.342027 seconds
|
||||
PCG Solve wall clock time = 0.270525 seconds
|
||||
PCG Setup wall clock time = 0.303927 seconds
|
||||
PCG Solve wall clock time = 0.249723 seconds
|
||||
# Output file: benchmark_ij.out.21
|
||||
PCG Setup wall clock time = 0.503688 seconds
|
||||
PCG Solve wall clock time = 0.272246 seconds
|
||||
PCG Setup wall clock time = 0.477654 seconds
|
||||
PCG Solve wall clock time = 0.243726 seconds
|
||||
# Output file: benchmark_ij.out.22
|
||||
PCG Setup wall clock time = 0.578109 seconds
|
||||
PCG Solve wall clock time = 0.189868 seconds
|
||||
PCG Setup wall clock time = 0.510603 seconds
|
||||
PCG Solve wall clock time = 0.187041 seconds
|
||||
# Output file: benchmark_ij.out.23
|
||||
PCG Setup wall clock time = 0.582089 seconds
|
||||
PCG Solve wall clock time = 0.211962 seconds
|
||||
PCG Setup wall clock time = 0.538188 seconds
|
||||
PCG Solve wall clock time = 0.192323 seconds
|
||||
# Output file: benchmark_ij.out.24
|
||||
PCG Setup wall clock time = 0.545068 seconds
|
||||
PCG Solve wall clock time = 0.232222 seconds
|
||||
PCG Setup wall clock time = 0.484426 seconds
|
||||
PCG Solve wall clock time = 0.203396 seconds
|
||||
# Output file: benchmark_ij.out.25
|
||||
PCG Setup wall clock time = 0.529062 seconds
|
||||
PCG Solve wall clock time = 0.254237 seconds
|
||||
PCG Setup wall clock time = 0.504628 seconds
|
||||
PCG Solve wall clock time = 0.211952 seconds
|
||||
# Output file: benchmark_ij.out.26
|
||||
PCG Setup wall clock time = 0.487924 seconds
|
||||
PCG Solve wall clock time = 0.566693 seconds
|
||||
PCG Setup wall clock time = 0.479777 seconds
|
||||
PCG Solve wall clock time = 0.542081 seconds
|
||||
# Output file: benchmark_ij.out.27
|
||||
PCG Setup wall clock time = 0.548090 seconds
|
||||
PCG Solve wall clock time = 0.406431 seconds
|
||||
PCG Setup wall clock time = 0.539575 seconds
|
||||
PCG Solve wall clock time = 0.392673 seconds
|
||||
|
||||
@ -1,66 +1,66 @@
|
||||
# Output file: benchmark_ij.out.1
|
||||
Iterations = 57
|
||||
Final Relative Residual Norm = 9.852266e-09
|
||||
Iterations = 55
|
||||
Final Relative Residual Norm = 9.274314e-09
|
||||
|
||||
# Output file: benchmark_ij.out.2
|
||||
Iterations = 47
|
||||
Final Relative Residual Norm = 8.930988e-09
|
||||
Iterations = 46
|
||||
Final Relative Residual Norm = 9.988931e-09
|
||||
|
||||
# Output file: benchmark_ij.out.3
|
||||
Iterations = 95
|
||||
Final Relative Residual Norm = 7.745498e-09
|
||||
Iterations = 98
|
||||
Final Relative Residual Norm = 7.887687e-09
|
||||
|
||||
# Output file: benchmark_ij.out.4
|
||||
Iterations = 21
|
||||
Final Relative Residual Norm = 4.026296e-09
|
||||
Final Relative Residual Norm = 3.469827e-09
|
||||
|
||||
# Output file: benchmark_ij.out.5
|
||||
Iterations = 18
|
||||
Final Relative Residual Norm = 8.938045e-09
|
||||
Final Relative Residual Norm = 4.722489e-09
|
||||
|
||||
# Output file: benchmark_ij.out.6
|
||||
Iterations = 87
|
||||
Final Relative Residual Norm = 8.301552e-09
|
||||
Final Relative Residual Norm = 7.730705e-09
|
||||
|
||||
# Output file: benchmark_ij.out.7
|
||||
Iterations = 20
|
||||
Final Relative Residual Norm = 4.830682e-09
|
||||
Final Relative Residual Norm = 4.901688e-09
|
||||
|
||||
# Output file: benchmark_ij.out.8
|
||||
Iterations = 55
|
||||
Final Relative Residual Norm = 7.448484e-09
|
||||
Final Relative Residual Norm = 8.477809e-09
|
||||
|
||||
# Output file: benchmark_ij.out.9
|
||||
Iterations = 46
|
||||
Final Relative Residual Norm = 8.999363e-09
|
||||
Final Relative Residual Norm = 9.985837e-09
|
||||
|
||||
# Output file: benchmark_ij.out.10
|
||||
Iterations = 93
|
||||
Final Relative Residual Norm = 8.530801e-09
|
||||
Final Relative Residual Norm = 8.530145e-09
|
||||
|
||||
# Output file: benchmark_ij.out.11
|
||||
Iterations = 21
|
||||
Final Relative Residual Norm = 4.026296e-09
|
||||
Final Relative Residual Norm = 3.469827e-09
|
||||
|
||||
# Output file: benchmark_ij.out.12
|
||||
Iterations = 18
|
||||
Final Relative Residual Norm = 8.938045e-09
|
||||
Final Relative Residual Norm = 4.722489e-09
|
||||
|
||||
# Output file: benchmark_ij.out.13
|
||||
Iterations = 87
|
||||
Final Relative Residual Norm = 8.301486e-09
|
||||
Final Relative Residual Norm = 7.730127e-09
|
||||
|
||||
# Output file: benchmark_ij.out.14
|
||||
Iterations = 20
|
||||
Final Relative Residual Norm = 8.245385e-09
|
||||
Iterations = 21
|
||||
Final Relative Residual Norm = 3.509276e-09
|
||||
|
||||
# Output file: benchmark_ij.out.15
|
||||
Iterations = 22
|
||||
Final Relative Residual Norm = 4.816512e-09
|
||||
Final Relative Residual Norm = 5.410863e-09
|
||||
|
||||
# Output file: benchmark_ij.out.16
|
||||
Iterations = 20
|
||||
Final Relative Residual Norm = 8.240873e-09
|
||||
Final Relative Residual Norm = 9.649695e-09
|
||||
|
||||
# Output file: benchmark_ij.out.17
|
||||
Iterations = 20
|
||||
@ -68,35 +68,35 @@ Final Relative Residual Norm = 3.529822e-09
|
||||
|
||||
# Output file: benchmark_ij.out.18
|
||||
Iterations = 20
|
||||
Final Relative Residual Norm = 3.555075e-09
|
||||
Final Relative Residual Norm = 3.320123e-09
|
||||
|
||||
# Output file: benchmark_ij.out.19
|
||||
Iterations = 43
|
||||
Final Relative Residual Norm = 6.278339e-09
|
||||
Final Relative Residual Norm = 7.541043e-09
|
||||
|
||||
# Output file: benchmark_ij.out.20
|
||||
Iterations = 38
|
||||
Final Relative Residual Norm = 7.063480e-09
|
||||
Final Relative Residual Norm = 4.440293e-09
|
||||
|
||||
# Output file: benchmark_ij.out.21
|
||||
Iterations = 26
|
||||
Final Relative Residual Norm = 8.638600e-09
|
||||
Iterations = 27
|
||||
Final Relative Residual Norm = 4.476621e-09
|
||||
|
||||
# Output file: benchmark_ij.out.22
|
||||
Iterations = 12
|
||||
Final Relative Residual Norm = 8.893885e-09
|
||||
Iterations = 13
|
||||
Final Relative Residual Norm = 1.703911e-09
|
||||
|
||||
# Output file: benchmark_ij.out.23
|
||||
Iterations = 13
|
||||
Final Relative Residual Norm = 6.269155e-09
|
||||
Final Relative Residual Norm = 5.943877e-09
|
||||
|
||||
# Output file: benchmark_ij.out.24
|
||||
Iterations = 14
|
||||
Final Relative Residual Norm = 2.060470e-09
|
||||
Final Relative Residual Norm = 3.250031e-09
|
||||
|
||||
# Output file: benchmark_ij.out.25
|
||||
Iterations = 15
|
||||
Final Relative Residual Norm = 5.160772e-09
|
||||
Final Relative Residual Norm = 5.483580e-09
|
||||
|
||||
# Output file: benchmark_ij.out.26
|
||||
Iterations = 37
|
||||
|
||||
Loading…
Reference in New Issue
Block a user