Consolidate multiple implementations of divup/div_up/div_ceil.
This commit is contained in:
parent e8515f78ac
commit a96545777b
@@ -1341,6 +1341,19 @@ double ceil(const double &x) { return ::ceil(x); }
 #endif


+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR
+T div_ceil(const T &a, const T &b)
+{
+  EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  eigen_assert(a >= 0);
+  eigen_assert(b > 0);
+  // Note: This form is used because it cannot overflow.
+  return a == 0 ? 0 : (a - 1) / b + 1;
+}
+
 /** Log base 2 for 32 bits positive integers.
   * Conveniently returns 0 for x==0. */
 inline int log2(int x)

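For context on the helper consolidated above: div_ceil(a, b) is the ceiling of a/b for a >= 0 and b > 0, i.e. the smallest q with q*b >= a. A minimal standalone sketch of the intended semantics (plain int, without Eigen's NumTraits/assert machinery):

#include <cassert>

// Same expression as the new numext::div_ceil, shown here outside Eigen.
constexpr int div_ceil_sketch(int a, int b) { return a == 0 ? 0 : (a - 1) / b + 1; }

int main() {
  assert(div_ceil_sketch(0, 8) == 0);  // empty input needs no blocks
  assert(div_ceil_sketch(1, 8) == 1);
  assert(div_ceil_sketch(8, 8) == 1);  // exact multiples are not rounded up
  assert(div_ceil_sketch(9, 8) == 2);
  return 0;
}
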
@@ -90,8 +90,6 @@ class gemm_class {
   const Index a_stride, b_stride;
   const Index a_off, b_off;

-  static EIGEN_ALWAYS_INLINE constexpr int div_up(int a, int b) { return a == 0 ? 0 : (a - 1) / b + 1; }
-
   EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) {
     _mm_prefetch((char *)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0);
   }
@@ -479,7 +477,7 @@ class gemm_class {
    *
    *              const Scalar *cox = (idx == 0) ? co1 : co2;
    *
-   *              const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+   *              const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
    *              scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
    *              write_c<0, um_vecs, idx, a_unroll>(cox);
    *
@@ -498,7 +496,7 @@ class gemm_class {
   EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) {
     if (pow >= 4) cox += ldc;

-    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
     auto &alpha_reg = zmm[alpha_load_reg];

     scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
@@ -644,7 +642,7 @@ class gemm_class {
   template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch, bool no_a_preload = false>
   EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
                                            Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
-    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);

     if (max_b_unroll >= 1)
       innerkernel_1pow<uk, 1, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
@@ -729,7 +727,7 @@ class gemm_class {

   template <int a_unroll, int b_unroll, int max_b_unroll>
   EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
-    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
     if (!use_less_a_regs && k > 1)
       a_loads<0, 2, 0, um_vecs, a_unroll>(ao);
     else

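In the hunks above, um_vecs counts how many cache-line-wide vector loads are needed to cover a_unroll elements. A hedged numeric sketch, assuming 16 floats per 64-byte cache line (the actual nelems_in_cache_line depends on the scalar type):

#include <cstdio>
#include <initializer_list>

static int div_ceil_i(int a, int b) { return a == 0 ? 0 : (a - 1) / b + 1; }

int main() {
  const int nelems_in_cache_line = 16;  // assumed: 64-byte line / 4-byte float
  for (int a_unroll : {8, 16, 24, 48})
    std::printf("a_unroll=%2d -> um_vecs=%d\n", a_unroll, div_ceil_i(a_unroll, nelems_in_cache_line));
  // prints 1, 1, 2, 3: a partial vector still occupies a whole cache line
  return 0;
}
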
@@ -422,15 +422,6 @@ template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b);

 using std::numeric_limits;

-// Integer division with rounding up.
-// T is assumed to be an integer type with a>=0, and b>0
-template<typename T>
-EIGEN_DEVICE_FUNC
-T div_ceil(const T &a, const T &b)
-{
-  return (a+b-1) / b;
-}
-
 // Handle integer comparisons of different signedness.
 template <typename X, typename Y, bool XIsInteger = NumTraits<X>::IsInteger, bool XIsSigned = NumTraits<X>::IsSigned,
           bool YIsInteger = NumTraits<Y>::IsInteger, bool YIsSigned = NumTraits<Y>::IsSigned>

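The variant removed above computed (a+b-1)/b, whose intermediate sum a+b-1 can overflow when a is close to the maximum of T; the surviving form (a-1)/b+1 never leaves the range [0, a]. A hedged standalone illustration (not Eigen code):

#include <cstdio>
#include <limits>

int main() {
  const long long a = std::numeric_limits<long long>::max();  // worst-case dividend
  const long long b = 2;
  // (a + b - 1) / b would require evaluating a + 1, which overflows a signed
  // 64-bit integer (undefined behaviour), so it is only shown as a comment.
  const long long q = (a == 0) ? 0 : (a - 1) / b + 1;  // consolidated form, stays in range
  std::printf("ceil(a / b) = %lld\n", q);              // 4611686018427387904
  return 0;
}
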
@@ -443,7 +443,7 @@ class TensorBlockMapper {
         const int dim = isColMajor ? i : NumDims - i - 1;
         m_block_dimensions[dim] =
             numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
-        coeff_to_allocate = divup(
+        coeff_to_allocate = numext::div_ceil(
             coeff_to_allocate,
             numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
       }
@@ -474,7 +474,7 @@ class TensorBlockMapper {
           const IndexType total_size_other_dims =
               total_size / m_block_dimensions[dim];
           const IndexType alloc_avail =
-              divup<IndexType>(target_block_size, total_size_other_dims);
+              numext::div_ceil<IndexType>(target_block_size, total_size_other_dims);
           if (alloc_avail == m_block_dimensions[dim]) {
             // Insufficient excess coefficients to allocate.
             break;
@@ -496,7 +496,7 @@ class TensorBlockMapper {
     // Calculate block counts by dimension and total block count.
     DSizes<IndexType, NumDims> block_count;
     for (int i = 0; i < NumDims; ++i) {
-      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
+      block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]);
     }
     m_total_block_count = array_prod(block_count);

@@ -898,7 +898,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       // First multiple after a. This is b when <= bcast_dim_left_index +
       // bcast_dim_size.
       const Index first_multiple =
-          divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
+          numext::div_ceil<Index>(bcast_dim_left_index, input_bcast_dim_size) *
           input_bcast_dim_size;

      if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {

@@ -144,8 +144,8 @@ struct TensorContractionBlockMemAllocator {
                                                               const Index bn) {
     Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
     BlockSizes sz;
-    sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align;
-    sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align;
+    sz.lhs_size = numext::div_ceil<Index>(bm * bk * sizeof(LhsScalar), align) * align;
+    sz.rhs_size = numext::div_ceil<Index>(bn * bk * sizeof(RhsScalar), align) * align;
     return sz;
   }
 };

@@ -206,9 +206,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     }

     // Number of kernels for each dimension.
-    Index nm0 = divup(m, bm);
-    Index nn0 = divup(n, bn);
-    Index nk = divup(k, bk);
+    Index nm0 = numext::div_ceil(m, bm);
+    Index nn0 = numext::div_ceil(n, bn);
+    Index nk = numext::div_ceil(k, bk);

     // Calculate task grain size (number of kernels executed per task).
     // This task size coarsening serves two purposes:
@@ -226,8 +226,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
     }
     // Number of tasks in each dimension.
-    Index nm = divup(nm0, gm);
-    Index nn = divup(nn0, gn);
+    Index nm = numext::div_ceil(nm0, gm);
+    Index nn = numext::div_ceil(nn0, gn);

     // If there is enough concurrency in the sharding dimension, we choose not
     // to paralellize by the other dimension, and execute all kernels in sync
@@ -1130,9 +1130,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           done(std::move(done_callback)),
           buffer_size_bytes(m * n * sizeof(Scalar)),
           block_size(blockSize(k, num_threads)),
-          num_blocks(divup<Index>(k, block_size)),
+          num_blocks(numext::div_ceil<Index>(k, block_size)),
           num_pending_blocks(internal::convert_index<int>(num_blocks)),
-          l0_ranges(divup<Index>(num_blocks, l0_size)),
+          l0_ranges(numext::div_ceil<Index>(num_blocks, l0_size)),
           l0_state(l0_ranges),
           block_buffers(num_blocks) {
       // Keep count of pending gemm tasks for each l0 range.
@@ -1434,10 +1434,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     static Index blockSize(Index k, int num_threads) {
       const auto round_up = [=](Index index) -> Index {
         const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
-        return divup<Index>(index, kmultiple) * kmultiple;
+        return numext::div_ceil<Index>(index, kmultiple) * kmultiple;
       };

-      const Index target_block_size = round_up(divup<Index>(k, num_threads));
+      const Index target_block_size = round_up(numext::div_ceil<Index>(k, num_threads));
       const Index desired_min_block_size = 12 * packet_size;

       return numext::mini<Index>(
@@ -1485,19 +1485,19 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
                  int num_threads, bool shard_by_col) const {
     Index gm = 1;
     Index gm1 = 1;
-    Index nm0 = divup(m, bm);
+    Index nm0 = numext::div_ceil(m, bm);
     Index nm1 = nm0;
     for (;;) {
       // Find the next candidate for m grain size. It needs to result in
       // different number of blocks. E.g. if we have 10 kernels, we want to try
       // 5 and 10, but not 6, 7, 8 and 9.
-      while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
+      while (gm1 <= nm0 && nm1 == numext::div_ceil(nm0, gm1)) gm1++;
       if (gm1 > nm0) break;
       // Check the candidate.
       int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
                            shard_by_col);
       if (res < 0) break;
-      nm1 = divup(nm0, gm1);
+      nm1 = numext::div_ceil(nm0, gm1);
       if (res == 0) continue;
       // Commit new grain size.
       gm = gm1;
@@ -1509,15 +1509,15 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
                  int num_threads, bool shard_by_col) const {
     Index gn = 1;
     Index gn1 = 1;
-    Index nn0 = divup(n, bn);
+    Index nn0 = numext::div_ceil(n, bn);
     Index nn1 = nn0;
     for (;;) {
-      while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
+      while (gn1 <= nn0 && nn1 == numext::div_ceil(nn0, gn1)) gn1++;
       if (gn1 > nn0) break;
       int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
                            shard_by_col);
       if (res < 0) break;
-      nn1 = divup(nn0, gn1);
+      nn1 = numext::div_ceil(nn0, gn1);
       if (res == 0) continue;
       gn = gn1;
     }
@@ -1544,14 +1544,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4
     // of cores will be busy). While grain size 3 gives us 4 tasks, which gives
     // us parallelism of 1 (we can load all cores).
-    Index nm0 = divup(m, bm);
-    Index nn0 = divup(n, bn);
-    Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
+    Index nm0 = numext::div_ceil(m, bm);
+    Index nn0 = numext::div_ceil(n, bn);
+    Index new_tasks = numext::div_ceil(nm0, gm) * numext::div_ceil(nn0, gn);
     double new_parallelism = static_cast<double>(new_tasks) /
-                             (divup<int>(new_tasks, num_threads) * num_threads);
-    Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
+                             (numext::div_ceil<int>(new_tasks, num_threads) * num_threads);
+    Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn);
     double old_parallelism = static_cast<double>(old_tasks) /
-                             (divup<int>(old_tasks, num_threads) * num_threads);
+                             (numext::div_ceil<int>(old_tasks, num_threads) * num_threads);
     if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
     return 0;
   }

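checkGrain above scores a grain size by the fraction of thread-slots kept busy: tasks / (div_ceil(tasks, threads) * threads). A hedged worked example consistent with the comment in the diff (12 kernels and 4 cores are assumed figures):

#include <cstdio>

static long long div_ceil_ll(long long a, long long b) { return a == 0 ? 0 : (a - 1) / b + 1; }

static double parallelism(long long tasks, long long threads) {
  // Tasks run in div_ceil(tasks, threads) waves of `threads` slots each.
  return static_cast<double>(tasks) / (div_ceil_ll(tasks, threads) * threads);
}

int main() {
  const long long kernels = 12, threads = 4;  // assumed figures
  std::printf("grain 2: %.2f\n", parallelism(div_ceil_ll(kernels, 2), threads));  // 6 tasks -> 0.75
  std::printf("grain 3: %.2f\n", parallelism(div_ceil_ll(kernels, 3), threads));  // 4 tasks -> 1.00
  return 0;
}
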
@@ -223,7 +223,7 @@ struct ThreadPoolDevice {
                                                   Index lastIdx) {
       while (lastIdx - firstIdx > block.size) {
         // Split into halves and schedule the second half on a different thread.
-        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
         pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
         lastIdx = midIdx;
       }
@@ -282,7 +282,7 @@ struct ThreadPoolDevice {
     ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
       while (lastIdx - firstIdx > block.size) {
         // Split into halves and schedule the second half on a different thread.
-        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
         pool_->Schedule(
             [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
         lastIdx = midIdx;
@@ -357,7 +357,7 @@ struct ThreadPoolDevice {
     const Index max_oversharding_factor = 4;
     Index block_size = numext::mini(
         n, numext::maxi<Index>(
-               divup<Index>(n, max_oversharding_factor * numThreads()),
+               numext::div_ceil<Index>(n, max_oversharding_factor * numThreads()),
               block_size_f));
     const Index max_block_size = numext::mini(n, 2 * block_size);

@@ -367,13 +367,13 @@ struct ThreadPoolDevice {
       block_size = numext::mini(n, new_block_size);
     }

-    Index block_count = divup(n, block_size);
+    Index block_count = numext::div_ceil(n, block_size);

     // Calculate parallel efficiency as fraction of total CPU time used for
     // computations:
     double max_efficiency =
         static_cast<double>(block_count) /
-        (divup<int>(block_count, numThreads()) * numThreads());
+        (numext::div_ceil<int>(block_count, numThreads()) * numThreads());

     // Now try to increase block size up to max_block_size as long as it
     // doesn't decrease parallel efficiency.
@@ -381,7 +381,7 @@ struct ThreadPoolDevice {
          max_efficiency < 1.0 && prev_block_count > 1;) {
       // This is the next block size that divides size into a smaller number
       // of blocks than the current block_size.
-      Index coarser_block_size = divup(n, prev_block_count - 1);
+      Index coarser_block_size = numext::div_ceil(n, prev_block_count - 1);
       if (block_align) {
         Index new_block_size = block_align(coarser_block_size);
         eigen_assert(new_block_size >= coarser_block_size);
@@ -391,12 +391,12 @@ struct ThreadPoolDevice {
         break;  // Reached max block size. Stop.
       }
       // Recalculate parallel efficiency.
-      const Index coarser_block_count = divup(n, coarser_block_size);
+      const Index coarser_block_count = numext::div_ceil(n, coarser_block_size);
       eigen_assert(coarser_block_count < prev_block_count);
       prev_block_count = coarser_block_count;
       const double coarser_efficiency =
           static_cast<double>(coarser_block_count) /
-          (divup<int>(coarser_block_count, numThreads()) * numThreads());
+          (numext::div_ceil<int>(coarser_block_count, numThreads()) * numThreads());
       if (coarser_efficiency + 0.01 >= max_efficiency) {
         // Taking it.
         block_size = coarser_block_size;

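The block-size search above repeatedly asks for the smallest block size that produces one block fewer, via div_ceil(n, prev_block_count - 1), and keeps it only if parallel efficiency does not drop. A hedged numeric sketch of one step (the figures are assumptions for illustration):

#include <cstdio>

static long long div_ceil_ll(long long a, long long b) { return a == 0 ? 0 : (a - 1) / b + 1; }

int main() {
  const long long n = 1000, block_size = 110;                 // assumed figures
  const long long block_count = div_ceil_ll(n, block_size);   // 10 blocks
  const long long coarser = div_ceil_ll(n, block_count - 1);  // 112: smallest size giving 9 blocks
  std::printf("%lld blocks of %lld -> try blocks of %lld (%lld blocks)\n",
              block_count, block_size, coarser, div_ceil_ll(n, coarser));
  return 0;
}
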
@@ -261,7 +261,7 @@ TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
   const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
   const size_t aligned_blocksize =
       align *
-      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
+      numext::div_ceil<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

   return {block_mapper, requirements.cost_per_coeff * block_size,
           aligned_blocksize};
@@ -661,7 +661,7 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Til
         block_size;
     const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
-    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, numext::div_ceil<int>(size, block_size)), 1);

     LAUNCH_GPU_KERNEL(
         (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),

@@ -27,21 +27,6 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
   return second;
 }

-
-template <typename T, typename X, typename Y>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T divup(const X x, const Y y) {
-  // Note: This form is used because it cannot overflow.
-  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T divup(const T x, const T y) {
-  // Note: This form is used because it cannot overflow.
-  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
-}
-
 template <size_t n> struct max_n_1 {
   static const size_t size = n;
 };
@@ -49,6 +34,11 @@ template <> struct max_n_1<0> {
   static const size_t size = 1;
 };

+template <typename T>
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+constexpr T divup(const T x, const T y) {
+  return Eigen::numext::div_ceil(x, y);
+}

 // Default packet types
 template <typename Scalar, typename Device>

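The Tensor module keeps divup as a deprecated alias that forwards to numext::div_ceil, so existing callers keep compiling (with a warning) while new code uses the consolidated helper; note that only the single-type overload survives, so callers of the removed divup<T>(X, Y) form need both operands converted to the same type. A hedged migration sketch for such a caller, assuming the unsupported Tensor header:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const Eigen::Index size = 1000, block = 256;
  const Eigen::Index a = Eigen::divup(size, block);             // old spelling, now EIGEN_DEPRECATED
  const Eigen::Index b = Eigen::numext::div_ceil(size, block);  // spelling after this commit
  std::cout << a << " " << b << "\n";                           // both print 4
  return 0;
}
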
@@ -414,7 +414,7 @@ struct FullReductionLauncher<
     typedef typename Self::Index Index;
     const int block_size = 256;
     const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);

     unsigned int* semaphore = NULL;
     if (num_blocks > 1) {
@@ -441,7 +441,7 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {

     const int block_size = 256;
     const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
     half* scratch = static_cast<half*>(device.scratchpad());

     if (num_blocks > 1) {
@@ -507,7 +507,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
   const int unroll_times = 16;
   eigen_assert(NumPerThread % unroll_times == 0);

-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
   const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

   const Index num_threads = blockDim.x * gridDim.x;
@@ -593,8 +593,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
   eigen_assert(NumPerThread % unroll_times == 0);
   eigen_assert(unroll_times % 2 == 0);

-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
-  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+  const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2);

   const Index num_threads = blockDim.x * gridDim.x;
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -785,7 +785,7 @@ struct InnerReductionLauncher<
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = 256;
     const int num_per_thread = 128;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
     const int max_blocks = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / block_size;
     const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
@@ -793,7 +793,7 @@ struct InnerReductionLauncher<
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there
       // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
+      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
       const int max_blocks2 = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / 1024;
       const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
@@ -831,7 +831,7 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = /*256*/128;
     const int num_per_thread = /*128*/64;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
     const int max_blocks = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / block_size;
     const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
@@ -900,7 +900,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reduce
   }

   // Do the reduction.
-  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
+  const Index max_iter = num_preserved_coeffs * numext::div_ceil<Index>(num_coeffs_to_reduce, NumPerThread);
   for (Index i = thread_id; i < max_iter; i += num_threads) {
     const Index input_col = i % num_preserved_coeffs;
     const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
@@ -953,7 +953,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = 256;
     const int num_per_thread = 16;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
     const int max_blocks = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / block_size;
     const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
@@ -961,7 +961,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
     if (num_blocks > 1) {
       // We initialize the outputs in the reduction kernel itself when we don't have to worry
       // about race conditions between multiple thread blocks.
-      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
+      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
       const int max_blocks2 = device.getNumGpuMultiProcessors() *
                              device.maxGpuThreadsPerMultiProcessor() / 1024;
       const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);

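The reduction launchers above all follow the same pattern: a grid of div_ceil(num_coeffs, block_size * num_per_thread) blocks, clamped to what the device can keep resident. A hedged host-side sketch of that arithmetic (problem size and device limit are assumed figures):

#include <algorithm>
#include <cstdio>

static int div_ceil_i(int a, int b) { return a == 0 ? 0 : (a - 1) / b + 1; }

int main() {
  const int num_coeffs = 1 << 20;                    // assumed problem size
  const int block_size = 256, num_per_thread = 128;  // as in FullReductionLauncher above
  const int max_blocks = 1024;                       // assumed: multiprocessors * threads-per-SM / block_size
  const int dyn_blocks = div_ceil_i(num_coeffs, block_size * num_per_thread);  // 32
  const int num_blocks = std::min(max_blocks, dyn_blocks);
  std::printf("launching %d blocks of %d threads\n", num_blocks, block_size);
  return 0;
}
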
@@ -215,7 +215,7 @@ EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
   EIGEN_CONSTEXPR Index kBlockAlignment = 128;
   const Index items_per_cacheline =
       numext::maxi<Index>(1, kBlockAlignment / item_size);
-  return items_per_cacheline * divup(block_size, items_per_cacheline);
+  return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline);
 }

 template <typename Self>

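AdjustBlockSize above rounds a block size up to a whole number of cache lines worth of items, the usual round-up-to-multiple idiom m * div_ceil(x, m). A hedged standalone sketch:

#include <algorithm>
#include <cstdio>

static long long div_ceil_ll(long long a, long long b) { return a == 0 ? 0 : (a - 1) / b + 1; }

static long long adjust_block_size(long long item_size, long long block_size) {
  const long long kBlockAlignment = 128;  // as in the diff above
  const long long items_per_cacheline = std::max(1LL, kBlockAlignment / item_size);
  return items_per_cacheline * div_ceil_ll(block_size, items_per_cacheline);
}

int main() {
  std::printf("%lld\n", adjust_block_size(sizeof(float), 100));  // 32 items per line -> rounds 100 up to 128
  return 0;
}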