diff --git a/src/parcsr_ls/par_relax.c b/src/parcsr_ls/par_relax.c index 59cc37fb4..47d6be2a3 100644 --- a/src/parcsr_ls/par_relax.c +++ b/src/parcsr_ls/par_relax.c @@ -1109,8 +1109,12 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A, hypre_Vector l1_norms_vec; hypre_ParVector l1_norms_parvec; + hypre_VectorNumVectors(&l1_norms_vec) = 1; + hypre_VectorMultiVecStorageMethod(&l1_norms_vec) = 0; + hypre_VectorOwnsData(&l1_norms_vec) = 0; hypre_VectorData(&l1_norms_vec) = l1_norms; hypre_VectorSize(&l1_norms_vec) = num_rows; + /* TODO XXX * The next line is NOT 100% correct, which should be the memory location of l1_norms instead of f * But how do I know it? As said, don't use raw pointers, don't use raw pointers! diff --git a/src/seq_mv/protos.h b/src/seq_mv/protos.h index 32b5f1dec..08be2e49e 100644 --- a/src/seq_mv/protos.h +++ b/src/seq_mv/protos.h @@ -236,6 +236,7 @@ HYPRE_Int hypre_SeqVectorInitialize_v2( hypre_Vector *vector, HYPRE_MemoryLocation memory_location ); HYPRE_Int hypre_SeqVectorInitialize ( hypre_Vector *vector ); HYPRE_Int hypre_SeqVectorSetDataOwner ( hypre_Vector *vector, HYPRE_Int owns_data ); +HYPRE_Int hypre_SeqVectorSetSize ( hypre_Vector *vector, HYPRE_Int size ); hypre_Vector *hypre_SeqVectorRead ( char *file_name ); HYPRE_Int hypre_SeqVectorPrint ( hypre_Vector *vector, char *file_name ); HYPRE_Int hypre_SeqVectorSetConstantValues ( hypre_Vector *v, HYPRE_Complex value ); diff --git a/src/seq_mv/seq_mv.h b/src/seq_mv/seq_mv.h index beedb1334..b80a2f0e8 100644 --- a/src/seq_mv/seq_mv.h +++ b/src/seq_mv/seq_mv.h @@ -508,6 +508,7 @@ HYPRE_Int hypre_SeqVectorInitialize_v2( hypre_Vector *vector, HYPRE_MemoryLocation memory_location ); HYPRE_Int hypre_SeqVectorInitialize ( hypre_Vector *vector ); HYPRE_Int hypre_SeqVectorSetDataOwner ( hypre_Vector *vector, HYPRE_Int owns_data ); +HYPRE_Int hypre_SeqVectorSetSize ( hypre_Vector *vector, HYPRE_Int size ); hypre_Vector *hypre_SeqVectorRead ( char *file_name ); HYPRE_Int hypre_SeqVectorPrint ( hypre_Vector *vector, char *file_name ); HYPRE_Int hypre_SeqVectorSetConstantValues ( hypre_Vector *v, HYPRE_Complex value ); diff --git a/src/seq_mv/vector.c b/src/seq_mv/vector.c index bba2e3d83..fccd58ebe 100644 --- a/src/seq_mv/vector.c +++ b/src/seq_mv/vector.c @@ -60,13 +60,11 @@ hypre_SeqMultiVectorCreate( HYPRE_Int size, HYPRE_Int num_vectors ) HYPRE_Int hypre_SeqVectorDestroy( hypre_Vector *vector ) { - HYPRE_Int ierr = 0; - if (vector) { HYPRE_MemoryLocation memory_location = hypre_VectorMemoryLocation(vector); - if ( hypre_VectorOwnsData(vector) ) + if (hypre_VectorOwnsData(vector)) { hypre_TFree(hypre_VectorData(vector), memory_location); } @@ -74,7 +72,7 @@ hypre_SeqVectorDestroy( hypre_Vector *vector ) hypre_TFree(vector, HYPRE_MEMORY_HOST); } - return ierr; + return hypre_error_flag; } /*-------------------------------------------------------------------------- @@ -87,7 +85,6 @@ HYPRE_Int hypre_SeqVectorInitialize_v2( hypre_Vector *vector, HYPRE_MemoryLocation memory_location ) { HYPRE_Int size = hypre_VectorSize(vector); - HYPRE_Int ierr = 0; HYPRE_Int num_vectors = hypre_VectorNumVectors(vector); HYPRE_Int multivec_storage_method = hypre_VectorMultiVecStorageMethod(vector); @@ -97,27 +94,28 @@ hypre_SeqVectorInitialize_v2( hypre_Vector *vector, HYPRE_MemoryLocation memory_ * to be consistent with `memory_location' * Otherwise, mismatches will exist and problems will be encountered * when being used, and freed */ - if ( !hypre_VectorData(vector) ) + if (!hypre_VectorData(vector)) { hypre_VectorData(vector) = hypre_CTAlloc(HYPRE_Complex, num_vectors * size, memory_location); } - if ( multivec_storage_method == 0 ) + if (multivec_storage_method == 0) { hypre_VectorVectorStride(vector) = size; - hypre_VectorIndexStride(vector) = 1; + hypre_VectorIndexStride(vector) = 1; } - else if ( multivec_storage_method == 1 ) + else if (multivec_storage_method == 1) { hypre_VectorVectorStride(vector) = 1; - hypre_VectorIndexStride(vector) = num_vectors; + hypre_VectorIndexStride(vector) = num_vectors; } else { - ++ierr; + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Invalid multivec storage method!\n"); + return hypre_error_flag; } - return ierr; + return hypre_error_flag; } /*-------------------------------------------------------------------------- @@ -127,11 +125,7 @@ hypre_SeqVectorInitialize_v2( hypre_Vector *vector, HYPRE_MemoryLocation memory_ HYPRE_Int hypre_SeqVectorInitialize( hypre_Vector *vector ) { - HYPRE_Int ierr; - - ierr = hypre_SeqVectorInitialize_v2( vector, hypre_VectorMemoryLocation(vector) ); - - return ierr; + return hypre_SeqVectorInitialize_v2(vector, hypre_VectorMemoryLocation(vector)); } /*-------------------------------------------------------------------------- @@ -142,11 +136,28 @@ HYPRE_Int hypre_SeqVectorSetDataOwner( hypre_Vector *vector, HYPRE_Int owns_data ) { - HYPRE_Int ierr = 0; - hypre_VectorOwnsData(vector) = owns_data; - return ierr; + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_SeqVectorSetSize + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_SeqVectorSetSize( hypre_Vector *vector, + HYPRE_Int size ) +{ + HYPRE_Int multivec_storage_method = hypre_VectorMultiVecStorageMethod(vector); + + hypre_VectorSize(vector) = size; + if (multivec_storage_method == 0) + { + hypre_VectorVectorStride(vector) = size; + } + + return hypre_error_flag; } /*-------------------------------------------------------------------------- @@ -201,7 +212,7 @@ HYPRE_Int hypre_SeqVectorPrint( hypre_Vector *vector, char *file_name ) { - FILE *fp; + FILE *fp; HYPRE_Complex *data; HYPRE_Int size, num_vectors, vecstride, idxstride; @@ -209,8 +220,6 @@ hypre_SeqVectorPrint( hypre_Vector *vector, HYPRE_Int i, j; HYPRE_Complex value; - HYPRE_Int ierr = 0; - num_vectors = hypre_VectorNumVectors(vector); vecstride = hypre_VectorVectorStride(vector); idxstride = hypre_VectorIndexStride(vector); @@ -265,7 +274,7 @@ hypre_SeqVectorPrint( hypre_Vector *vector, fclose(fp); - return ierr; + return hypre_error_flag; } /*-------------------------------------------------------------------------- @@ -281,51 +290,65 @@ hypre_SeqVectorSetConstantValues( hypre_Vector *v, #endif HYPRE_Complex *vector_data = hypre_VectorData(v); + HYPRE_Int num_vectors = hypre_VectorNumVectors(v); HYPRE_Int size = hypre_VectorSize(v); - HYPRE_Int ierr = 0; + HYPRE_Int total_size = size * num_vectors; - size *= hypre_VectorNumVectors(v); - - //hypre_SeqVectorPrefetch(v, HYPRE_MEMORY_DEVICE); - -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - if (size > 0) + /* Trivial case */ + if (total_size <= 0) { - hypreDevice_ComplexFilln( vector_data, size, value ); + return hypre_error_flag; } -#elif defined(HYPRE_USING_SYCL) - if (size > 0) - { - HYPRE_ONEDPL_CALL( std::fill_n, vector_data, size, value ); - } -#else - HYPRE_Int i; -#if defined(HYPRE_USING_DEVICE_OPENMP) - #pragma omp target teams distribute parallel for private(i) is_device_ptr(vector_data) -#elif defined(HYPRE_USING_OPENMP) - #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE -#endif - for (i = 0; i < size; i++) - { - vector_data[i] = value; - } -#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncComputeStream(hypre_handle()); + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1(hypre_VectorMemoryLocation(v)); + + //hypre_SeqVectorPrefetch(v, HYPRE_MEMORY_DEVICE); + if (exec == HYPRE_EXEC_DEVICE) + { +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + hypreDevice_ComplexFilln(vector_data, total_size, value); + +#elif defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL(std::fill_n, vector_data, total_size, value); + +#elif defined(HYPRE_USING_DEVICE_OPENMP) + HYPRE_Int i; + + #pragma omp target teams distribute parallel for private(i) is_device_ptr(vector_data) + for (i = 0; i < total_size; i++) + { + vector_data[i] = value; + } #endif + hypre_SyncComputeStream(hypre_handle()); + } + else +#endif /* defined(HYPRE_USING_GPU) */ + { + HYPRE_Int i; + +#if defined(HYPRE_USING_OPENMP) + #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif + for (i = 0; i < total_size; i++) + { + vector_data[i] = value; + } + } + #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime(); #endif - return ierr; + return hypre_error_flag; } /*-------------------------------------------------------------------------- * hypre_SeqVectorSetRandomValues * - * returns vector of values randomly distributed between -1.0 and +1.0 + * returns vector of values randomly distributed between -1.0 and +1.0 *--------------------------------------------------------------------------*/ HYPRE_Int @@ -335,9 +358,8 @@ hypre_SeqVectorSetRandomValues( hypre_Vector *v, HYPRE_Complex *vector_data = hypre_VectorData(v); HYPRE_Int size = hypre_VectorSize(v); HYPRE_Int i; - HYPRE_Int ierr = 0; - hypre_SeedRand(seed); + hypre_SeedRand(seed); size *= hypre_VectorNumVectors(v); if (hypre_GetActualMemLocation(hypre_VectorMemoryLocation(v)) == hypre_MEMORY_HOST) @@ -360,7 +382,7 @@ hypre_SeqVectorSetRandomValues( hypre_Vector *v, hypre_TFree(h_data, HYPRE_MEMORY_HOST); } - return ierr; + return hypre_error_flag; } /*-------------------------------------------------------------------------- @@ -377,8 +399,6 @@ hypre_SeqVectorCopy( hypre_Vector *x, hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime(); #endif - HYPRE_Int ierr = 0; - size_t size = hypre_min( hypre_VectorSize(x), hypre_VectorSize(y) ) * hypre_VectorNumVectors(x); hypre_TMemcpy( hypre_VectorData(y), @@ -392,7 +412,7 @@ hypre_SeqVectorCopy( hypre_Vector *x, hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime(); #endif - return ierr; + return hypre_error_flag; } /*-------------------------------------------------------------------------- @@ -443,7 +463,7 @@ hypre_SeqVectorCloneShallow( hypre_Vector *x ) hypre_VectorMemoryLocation(y) = hypre_VectorMemoryLocation(x); hypre_VectorData(y) = hypre_VectorData(x); - hypre_SeqVectorSetDataOwner( y, 0 ); + hypre_SeqVectorSetDataOwner(y, 0); hypre_SeqVectorInitialize(y); return y; @@ -452,6 +472,7 @@ hypre_SeqVectorCloneShallow( hypre_Vector *x ) /*-------------------------------------------------------------------------- * hypre_SeqVectorScale *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SeqVectorScale( HYPRE_Complex alpha, hypre_Vector *y ) @@ -473,7 +494,6 @@ hypre_SeqVectorScale( HYPRE_Complex alpha, HYPRE_Complex *y_data = hypre_VectorData(y); HYPRE_Int size = hypre_VectorSize(y); - HYPRE_Int ierr = 0; size *= hypre_VectorNumVectors(y); @@ -526,7 +546,7 @@ hypre_SeqVectorScale( HYPRE_Complex alpha, hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime(); #endif - return ierr; + return hypre_error_flag; } /*-------------------------------------------------------------------------- @@ -544,7 +564,6 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha, HYPRE_Complex *x_data = hypre_VectorData(x); HYPRE_Complex *y_data = hypre_VectorData(y); HYPRE_Int size = hypre_VectorSize(x); - HYPRE_Int ierr = 0; size *= hypre_VectorNumVectors(x); @@ -599,7 +618,7 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha, hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime(); #endif - return ierr; + return hypre_error_flag; } /* y = y + x ./ b */ @@ -612,10 +631,26 @@ hypre_SeqVectorElmdivpy( hypre_Vector *x, hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime(); #endif - HYPRE_Complex *x_data = hypre_VectorData(x); - HYPRE_Complex *b_data = hypre_VectorData(b); - HYPRE_Complex *y_data = hypre_VectorData(y); - HYPRE_Int size = hypre_VectorSize(b); + HYPRE_Complex *x_data = hypre_VectorData(x); + HYPRE_Complex *b_data = hypre_VectorData(b); + HYPRE_Complex *y_data = hypre_VectorData(y); + HYPRE_Int num_vectors_x = hypre_VectorNumVectors(x); + HYPRE_Int num_vectors_y = hypre_VectorNumVectors(y); + HYPRE_Int num_vectors_b = hypre_VectorNumVectors(b); + HYPRE_Int size = hypre_VectorSize(x); + + /* Sanity checks */ + if (hypre_VectorSize(x) != hypre_VectorSize(y) || + hypre_VectorSize(y) != hypre_VectorSize(b)) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Error: size of x, y, and b do not match!\n"); + return hypre_error_flag; + } + + /* row-wise multivec is not supportted */ + hypre_assert(hypre_VectorMultiVecStorageMethod(x) == 0); + hypre_assert(hypre_VectorMultiVecStorageMethod(b) == 0); + hypre_assert(hypre_VectorMultiVecStorageMethod(y) == 0); #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) //HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_VectorMemoryLocation(x), hypre_VectorMemoryLocation(b) ); @@ -630,18 +665,84 @@ hypre_SeqVectorElmdivpy( hypre_Vector *x, #pragma omp target teams distribute parallel for private(i) is_device_ptr(u_data,v_data,l1_norms) #endif */ - hypreDevice_IVAXPY(size, b_data, x_data, y_data); + + if (num_vectors_b == 1) + { + if (num_vectors_x == 1) + { + hypreDevice_IVAXPY(size, b_data, x_data, y_data); + } + else if (num_vectors_x == num_vectors_y) + { + hypreDevice_IVAMXPMY(num_vectors_x, size, b_data, x_data, y_data); + } + else + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unsupported combination of num_vectors!\n"); + return hypre_error_flag; + } + } + else + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unsupported combination of num_vectors!\n"); + return hypre_error_flag; + } } else #endif { - HYPRE_Int i; -#ifdef HYPRE_USING_OPENMP - #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE -#endif - for (i = 0; i < size; i++) + HYPRE_Int i, j; + + if (num_vectors_b == 1) { - y_data[i] += x_data[i] / b_data[i]; + if (num_vectors_x == 1 && num_vectors_y == 1) + { +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif + for (i = 0; i < size; i++) + { + y_data[i] += x_data[i] / b_data[i]; + } + } + else if (num_vectors_x == 2 && num_vectors_y == 2) + { +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif + for (i = 0; i < size; i++) + { + HYPRE_Complex val = 1.0 / b_data[i]; + + y_data[i] += x_data[i] * val; + y_data[i + size] += x_data[i + size] * val; + } + } + else if (num_vectors_x == num_vectors_y) + { +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(i, j) HYPRE_SMP_SCHEDULE +#endif + for (i = 0; i < size; i++) + { + HYPRE_Complex val = 1.0 / b_data[i]; + + for (j = 0; j < num_vectors_x; j++) + { + y_data[i + size * j] += x_data[i + size * j] * val; + } + } + } + else + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unsupported combination of num_vectors!\n"); + return hypre_error_flag; + } + } + else + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Unsupported combination of num_vectors!\n"); + return hypre_error_flag; } } @@ -813,7 +914,6 @@ HYPRE_Complex hypre_SeqVectorSumElts( hypre_Vector *vector ) HYPRE_Int hypre_SeqVectorPrefetch( hypre_Vector *x, HYPRE_MemoryLocation memory_location) { - HYPRE_Int ierr = 0; #ifdef HYPRE_USING_UNIFIED_MEMORY if (hypre_VectorMemoryLocation(x) != HYPRE_MEMORY_DEVICE) { @@ -826,13 +926,13 @@ hypre_SeqVectorPrefetch( hypre_Vector *x, HYPRE_MemoryLocation memory_location) if (size == 0) { - return ierr; + return hypre_error_flag; } hypre_MemPrefetch(x_data, sizeof(HYPRE_Complex)*size, memory_location); #endif - return ierr; + return hypre_error_flag; } #if 0 @@ -850,7 +950,6 @@ hypre_SeqVectorMax( HYPRE_Complex alpha, HYPRE_Complex *x_data = hypre_VectorData(x); HYPRE_Complex *y_data = hypre_VectorData(y); HYPRE_Int size = hypre_VectorSize(x); - HYPRE_Int ierr = 0; size *= hypre_VectorNumVectors(x); @@ -886,6 +985,6 @@ hypre_SeqVectorMax( HYPRE_Complex alpha, hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime(); #endif - return ierr; + return hypre_error_flag; } #endif diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 1fc849dba..8b02ad34c 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1785,10 +1785,13 @@ HYPRE_Int hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Comple HYPRE_Int hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y); HYPRE_Int hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y, HYPRE_Int *marker, HYPRE_Int marker_val); +HYPRE_Int hypreDevice_IVAMXPMY(HYPRE_Int m, HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y); HYPRE_Int hypreDevice_IntFilln(HYPRE_Int *d_x, size_t n, HYPRE_Int v); HYPRE_Int hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v); HYPRE_Int hypreDevice_ComplexFilln(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v); HYPRE_Int hypreDevice_CharFilln(char *d_x, size_t n, char v); +HYPRE_Int hypreDevice_IntStridedCopy ( HYPRE_Int size, HYPRE_Int stride, + HYPRE_Int *in, HYPRE_Int *out ); HYPRE_Int hypreDevice_IntScalen(HYPRE_Int *d_x, size_t n, HYPRE_Int *d_y, HYPRE_Int v); HYPRE_Int hypreDevice_ComplexScalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex *d_y, HYPRE_Complex v); diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index f9668ba34..bad4bf7c3 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -571,6 +571,101 @@ hypreDevice_IVAXPYMarked( HYPRE_Int n, return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypreGPUKernel_IVAMXPMY + * + * Device kernel for hypreDevice_IVAMXPMY. The template argument MM tells + * the maximum number of vectors in the unrolled loop + *--------------------------------------------------------------------------*/ + +template +__global__ void +hypreGPUKernel_IVAMXPMY( hypre_DeviceItem &item, + HYPRE_Int m, + HYPRE_Int n, + HYPRE_Complex *a, + HYPRE_Complex *x, + HYPRE_Complex *y) +{ + HYPRE_Int i = hypre_gpu_get_grid_thread_id<1, 1>(item); + + if (i < n) + { + HYPRE_Complex val = 1.0 / a[i]; + if (MM > 0) + { +#pragma unroll + for (HYPRE_Int j = 0; j < MM; j++) + { + y[i + j * n] += x[i + j * n] * val; + } + } + else + { + /* Generic case */ + for (HYPRE_Int j = 0; j < m; j++) + { + y[i + j * n] += x[i + j * n] * val; + } + } + } +} + +/*-------------------------------------------------------------------------- + * hypreDevice_IVAMXPMY + * + * Inverse Vector AXPY for m vectors x and y of size n stored column-wise: + * + * y[i + 0] += x[i + 0] / a[i] + * y[i + n] += x[i + n] / a[i] + * ... ... + * y[i + (m-1)*n] += x[i + (m-1)*n] / a[i] + * + * Note: does not work for row-wise multivectors + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypreDevice_IVAMXPMY( HYPRE_Int m, + HYPRE_Int n, + HYPRE_Complex *a, + HYPRE_Complex *x, + HYPRE_Complex *y) +{ + /* trivial case */ + if (n <= 0) + { + return hypre_error_flag; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); + + switch (m) + { + case 1: + HYPRE_GPU_LAUNCH( hypreGPUKernel_IVAXPY, gDim, bDim, n, a, x, y ); + break; + + case 2: + HYPRE_GPU_LAUNCH( hypreGPUKernel_IVAMXPMY<2>, gDim, bDim, m, n, a, x, y ); + break; + + case 3: + HYPRE_GPU_LAUNCH( hypreGPUKernel_IVAMXPMY<3>, gDim, bDim, m, n, a, x, y ); + break; + + case 4: + HYPRE_GPU_LAUNCH( hypreGPUKernel_IVAMXPMY<4>, gDim, bDim, m, n, a, x, y ); + break; + + default: + HYPRE_GPU_LAUNCH( hypreGPUKernel_IVAMXPMY<0>, gDim, bDim, m, n, a, x, y ); + break; + } + + return hypre_error_flag; +} + /*-------------------------------------------------------------------- * hypreDevice_CsrRowPtrsToIndices *--------------------------------------------------------------------*/ @@ -1618,87 +1713,6 @@ hypre_ResetDeviceRandGenerator( hypre_ulonglongint seed, #endif /* #if defined(HYPRE_USING_CURAND) || defined(HYPRE_USING_ROCRAND) || defined(HYPRE_USING_ONEMKLRAND) */ -/*-------------------------------------------------------------------- - * hypreGPUKernel_filln - *--------------------------------------------------------------------*/ - -template -__global__ void -hypreGPUKernel_filln(hypre_DeviceItem &item, T *x, size_t n, T v) -{ - HYPRE_Int i = hypre_gpu_get_grid_thread_id<1, 1>(item); - - if (i < n) - { - x[i] = v; - } -} - -/*-------------------------------------------------------------------- - * hypreDevice_Filln - *--------------------------------------------------------------------*/ - -template -HYPRE_Int -hypreDevice_Filln(T *d_x, size_t n, T v) -{ -#if 0 - HYPRE_THRUST_CALL( fill_n, d_x, n, v); -#else - if (n <= 0) - { - return hypre_error_flag; - } - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - - HYPRE_GPU_LAUNCH( hypreGPUKernel_filln, gDim, bDim, d_x, n, v ); -#endif - - return hypre_error_flag; -} - -/*-------------------------------------------------------------------- - * hypreDevice_ComplexFilln - *--------------------------------------------------------------------*/ - -HYPRE_Int -hypreDevice_ComplexFilln(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v) -{ - return hypreDevice_Filln(d_x, n, v); -} - -/*-------------------------------------------------------------------- - * hypreDevice_CharFilln - *--------------------------------------------------------------------*/ - -HYPRE_Int -hypreDevice_CharFilln(char *d_x, size_t n, char v) -{ - return hypreDevice_Filln(d_x, n, v); -} - -/*-------------------------------------------------------------------- - * hypreDevice_IntFilln - *--------------------------------------------------------------------*/ - -HYPRE_Int -hypreDevice_IntFilln(HYPRE_Int *d_x, size_t n, HYPRE_Int v) -{ - return hypreDevice_Filln(d_x, n, v); -} - -/*-------------------------------------------------------------------- - * hypreDevice_BigIntFilln - *--------------------------------------------------------------------*/ - -HYPRE_Int -hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v) -{ - return hypreDevice_Filln(d_x, n, v); -} - #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1868,6 +1882,158 @@ hypreDevice_ComplexScalen( HYPRE_Complex *d_x, return hypreDevice_Scalen(d_x, n, d_y, v); } +/*-------------------------------------------------------------------- + * hypreGPUKernel_filln + *--------------------------------------------------------------------*/ + +template +__global__ void +hypreGPUKernel_filln(hypre_DeviceItem &item, T *x, size_t n, T v) +{ + HYPRE_Int i = hypre_gpu_get_grid_thread_id<1, 1>(item); + + if (i < n) + { + x[i] = v; + } +} + +/*-------------------------------------------------------------------- + * hypreDevice_Filln + *--------------------------------------------------------------------*/ + +template +HYPRE_Int +hypreDevice_Filln(T *d_x, size_t n, T v) +{ +#if 0 + HYPRE_THRUST_CALL( fill_n, d_x, n, v); +#else + if (n <= 0) + { + return hypre_error_flag; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); + + HYPRE_GPU_LAUNCH( hypreGPUKernel_filln, gDim, bDim, d_x, n, v ); +#endif + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------- + * hypreDevice_ComplexFilln + *--------------------------------------------------------------------*/ + +HYPRE_Int +hypreDevice_ComplexFilln( HYPRE_Complex *d_x, + size_t n, + HYPRE_Complex v ) +{ + return hypreDevice_Filln(d_x, n, v); +} + +/*-------------------------------------------------------------------- + * hypreDevice_CharFilln + *--------------------------------------------------------------------*/ + +HYPRE_Int +hypreDevice_CharFilln( char *d_x, + size_t n, + char v ) +{ + return hypreDevice_Filln(d_x, n, v); +} + +/*-------------------------------------------------------------------- + * hypreDevice_IntFilln + *--------------------------------------------------------------------*/ + +HYPRE_Int +hypreDevice_IntFilln( HYPRE_Int *d_x, + size_t n, + HYPRE_Int v ) +{ + return hypreDevice_Filln(d_x, n, v); +} + +/*-------------------------------------------------------------------- + * hypreDevice_BigIntFilln + *--------------------------------------------------------------------*/ + +HYPRE_Int +hypreDevice_BigIntFilln( HYPRE_BigInt *d_x, + size_t n, + HYPRE_BigInt v) +{ + return hypreDevice_Filln(d_x, n, v); +} + +/*-------------------------------------------------------------------- + * hypreGPUKernel_StridedCopy + *--------------------------------------------------------------------*/ + +template +__global__ void +hypreGPUKernel_StridedCopy(hypre_DeviceItem &item, + HYPRE_Int size, + HYPRE_Int stride, + T *in, + T *out ) +{ + HYPRE_Int i = hypre_gpu_get_grid_thread_id<1, 1>(item); + + if (i < size) + { + out[i] = in[i * stride]; + } +} + +/*-------------------------------------------------------------------- + * hypreDevice_StridedCopy + *--------------------------------------------------------------------*/ + +template +HYPRE_Int +hypreDevice_StridedCopy( HYPRE_Int size, + HYPRE_Int stride, + T *in, + T *out ) +{ + if (size < 1 || stride < 1) + { + return hypre_error_flag; + } + + if (in == out) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Cannot perform in-place strided copy"); + return hypre_error_flag; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(size, "thread", bDim); + + HYPRE_GPU_LAUNCH( hypreGPUKernel_StridedCopy, gDim, bDim, size, stride, in, out ); + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------- + * hypreDevice_IntStridedCopy + *--------------------------------------------------------------------*/ + +HYPRE_Int +hypreDevice_IntStridedCopy( HYPRE_Int size, + HYPRE_Int stride, + HYPRE_Int *in, + HYPRE_Int *out ) +{ + return hypreDevice_StridedCopy(size, stride, in, out); +} + /*-------------------------------------------------------------------- * hypreDevice_CsrRowPtrsToIndicesWithRowNum * diff --git a/src/utilities/protos.h b/src/utilities/protos.h index 43ab82ce8..9e4e048f4 100644 --- a/src/utilities/protos.h +++ b/src/utilities/protos.h @@ -282,10 +282,13 @@ HYPRE_Int hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Comple HYPRE_Int hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y); HYPRE_Int hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y, HYPRE_Int *marker, HYPRE_Int marker_val); +HYPRE_Int hypreDevice_IVAMXPMY(HYPRE_Int m, HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y); HYPRE_Int hypreDevice_IntFilln(HYPRE_Int *d_x, size_t n, HYPRE_Int v); HYPRE_Int hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v); HYPRE_Int hypreDevice_ComplexFilln(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v); HYPRE_Int hypreDevice_CharFilln(char *d_x, size_t n, char v); +HYPRE_Int hypreDevice_IntStridedCopy ( HYPRE_Int size, HYPRE_Int stride, + HYPRE_Int *in, HYPRE_Int *out ); HYPRE_Int hypreDevice_IntScalen(HYPRE_Int *d_x, size_t n, HYPRE_Int *d_y, HYPRE_Int v); HYPRE_Int hypreDevice_ComplexScalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex *d_y, HYPRE_Complex v);