Fix some typos found
commit afa616bc9e (parent 76bb29c0c2)
Author: Kolja Brix
@@ -23,7 +23,7 @@ namespace internal {
     outside of which tanh(x) = +/-1 in single precision. The input is clamped
     to the range [-c, c]. The value c is chosen as the smallest value where
     the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004]
-    the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero.
+    the approximation tanh(x) ~= x is used for better accuracy as x tends to zero.
 
     This implementation works on both scalars and packets.
 */
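The comment above describes a clamp-then-approximate scheme. A minimal scalar sketch of that idea, with an illustrative clamp bound and std::tanh standing in for the actual polynomial (the function name and the value of c are made up for illustration):

    #include <algorithm>
    #include <cmath>

    float fast_tanh_sketch(float x) {
      const float c = 7.99f;        // illustrative clamp bound; the real c is where the approximation hits exactly 1
      const float tiny = 0.0004f;   // below this threshold, tanh(x) ~= x
      x = std::min(c, std::max(-c, x));   // clamp the input to [-c, c]
      if (std::abs(x) < tiny) return x;   // tanh(x) ~= x for better accuracy as x tends to zero
      return std::tanh(x);                // stand-in for the clamped polynomial approximation
    }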
@@ -31,7 +31,7 @@ namespace internal {
 *    some (optional) processing of the outcome, e.g., division by n for mean.
 *
 * For the vectorized path let's observe that the packet-size and outer-unrolling
-* are both decided by the assignement logic. So all we have to do is to decide
+* are both decided by the assignment logic. So all we have to do is to decide
 * on the inner unrolling.
 *
 * For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h,
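For orientation, the "vectorized path" discussed in this comment is what partial reductions compile down to; a plain usage example of the kind of expression that takes this path:

    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXf m = Eigen::MatrixXf::Random(8, 4);
      Eigen::RowVectorXf col_sums = m.colwise().sum();  // one reduction per column
      Eigen::VectorXf row_means = m.rowwise().mean();   // "division by n for mean" is the optional post-processing
      return 0;
    }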
@@ -596,7 +596,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       return m_matrix += extendedTo(other.derived());
     }
 
-    /** Substracts the vector \a other to each subvector of \c *this */
+    /** Subtracts the vector \a other to each subvector of \c *this */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     ExpressionType& operator-=(const DenseBase<OtherDerived>& other)
@@ -606,7 +606,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       return m_matrix -= extendedTo(other.derived());
     }
 
-    /** Multiples each subvector of \c *this by the vector \a other */
+    /** Multiplies each subvector of \c *this by the vector \a other */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     ExpressionType& operator*=(const DenseBase<OtherDerived>& other)
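Both documented operators broadcast a vector across every subvector (column or row) of the expression; a small usage example in the spirit of the vectorwiseop tests shown further below:

    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXf m = Eigen::MatrixXf::Random(4, 3);
      Eigen::VectorXf colvec = Eigen::VectorXf::Random(4);
      Eigen::RowVectorXf rowvec = Eigen::RowVectorXf::Random(3);
      m.colwise() -= colvec;   // subtract colvec from every column of m
      m.rowwise() += rowvec;   // add rowvec to every row of m
      return 0;
    }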
@@ -2234,7 +2234,7 @@ EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
 
 #if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1)
   // Since GCC 10.1 supports avx512bf16 and C style explicit cast
-  // (C++ static_cast is not supported yet), do converion via intrinsic
+  // (C++ static_cast is not supported yet), do conversion via intrinsic
   // and register path for performance.
   r = (__m256i)(_mm512_cvtneps_pbh(a));
@@ -572,7 +572,7 @@ inline float trig_reduce_huge (float xf, int *quadrant)
   using Eigen::numext::uint64_t;
 
   const double pio2_62 = 3.4061215800865545e-19;    // pi/2 * 2^-62
-  const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt
+  const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format
 
   // 192 bits of 2/pi for Payne-Hanek reduction
   // Bits are introduced by packet of 8 to enable aligned reads.
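For anyone double-checking the constant whose comment is being fixed here: in a 2.62 fixed-point format a value v is stored as the integer v * 2^62, so 0.5 becomes 2^62 / 2 = 2^61, which is exactly uint64_t(1) << 61. The companion constant pio2_62 = pi/2 * 2^-62 presumably scales the fixed-point result back into radians.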
@@ -3461,7 +3461,7 @@ EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
 
 EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
 {
-  // See the scalar implemention in BFloat16.h for a comprehensible explanation
+  // See the scalar implementation in BFloat16.h for a comprehensible explanation
   // of this fast rounding algorithm
   Packet4ui input = reinterpret_cast<Packet4ui>(p);
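The "fast rounding algorithm" the comment points to is, in essence, the usual round-to-nearest-even bias trick for truncating a float to its upper 16 bits; a self-contained scalar sketch (NaN handling omitted, function name made up):

    #include <cstdint>
    #include <cstring>

    std::uint16_t f32_to_bf16_sketch(float f) {
      std::uint32_t bits;
      std::memcpy(&bits, &f, sizeof bits);            // bit-level view of the float
      std::uint32_t lsb = (bits >> 16) & 1u;          // lowest bit that survives the truncation
      bits += 0x7fffu + lsb;                          // bias so that exact ties round to even
      return static_cast<std::uint16_t>(bits >> 16);  // keep the top 16 bits (sign, exponent, 7 mantissa bits)
    }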
@@ -624,7 +624,7 @@
 #define EIGEN_CPLUSPLUS 0
 #endif
 
-// The macro EIGEN_COMP_CXXVER defines the c++ verson expected by the compiler.
+// The macro EIGEN_COMP_CXXVER defines the c++ version expected by the compiler.
 // For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER
 // is defined to 17.
 #if EIGEN_CPLUSPLUS > 201703L
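For orientation, the mapping this comment documents is driven by the standard __cplusplus values; a sketch of such a chain (the _SKETCH name and the exact branch order are illustrative, not Eigen's verbatim definition):

    // Assumes EIGEN_CPLUSPLUS holds the effective __cplusplus value (0 when unknown).
    #if   EIGEN_CPLUSPLUS >  201703L
      #define EIGEN_COMP_CXXVER_SKETCH 20
    #elif EIGEN_CPLUSPLUS >= 201703L
      #define EIGEN_COMP_CXXVER_SKETCH 17
    #elif EIGEN_CPLUSPLUS >= 201402L
      #define EIGEN_COMP_CXXVER_SKETCH 14
    #elif EIGEN_CPLUSPLUS >= 201103L
      #define EIGEN_COMP_CXXVER_SKETCH 11
    #else
      #define EIGEN_COMP_CXXVER_SKETCH 03
    #endif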
@@ -1,5 +1,5 @@
 #ifdef EIGEN_WARNINGS_DISABLED_2
-// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet!
+// "DisableStupidWarnings.h" was included twice recursively: Do not re-enable warnings yet!
 #  undef EIGEN_WARNINGS_DISABLED_2
 
 #elif defined(EIGEN_WARNINGS_DISABLED)
@@ -17,7 +17,7 @@
   #endif
 
   #if defined __NVCC__
-//    Don't reenable the diagnostic messages, as it turns out these messages need
+//    Don't re-enable the diagnostic messages, as it turns out these messages need
 //    to be disabled at the point of the template instantiation (i.e the user code)
 //    otherwise they'll be triggered by nvcc.
 //    #pragma diag_default code_is_unreachable
@@ -20,7 +20,7 @@ The build stage consists of the following jobs:
 
 In principle every build-job has a corresponding test-job, however testing supported and unsupported modules is divided into separate jobs. The test jobs in detail:
 
-### Job dependecies
+### Job dependencies
 
 | Job Name                                            | Arch      | OS             | Compiler   | C++11   | Module
 |-----------------------------------------------------|-----------|----------------|------------|---------|--------
@@ -889,7 +889,7 @@ void packetmath_real() {
         data1[0] = std::numeric_limits<Scalar>::denorm_min();
         data1[1] = -std::numeric_limits<Scalar>::denorm_min();
         h.store(data2, internal::plog(h.load(data1)));
-        // TODO(rmlarsen): Reenable.
+        // TODO(rmlarsen): Re-enable.
         //        VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
         VERIFY((numext::isnan)(data2[1]));
       }
@@ -41,7 +41,7 @@ template<typename ArrayType> void vectorwiseop_array(const ArrayType& m)
   VERIFY_IS_APPROX(m2, m1.rowwise() + rowvec);
   VERIFY_IS_APPROX(m2.row(r), m1.row(r) + rowvec);
 
-  // test substraction
+  // test subtraction
   m2 = m1;
   m2.colwise() -= colvec;
   VERIFY_IS_APPROX(m2, m1.colwise() - colvec);
@@ -142,7 +142,7 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
   VERIFY_IS_APPROX(m2.row(r), m1.row(r) + rowvec);
 
 
-  // test substraction
+  // test subtraction
   m2 = m1;
   m2.colwise() -= colvec;
   VERIFY_IS_APPROX(m2, m1.colwise() - colvec);
@@ -107,7 +107,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  protected: //  all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout;
+  protected: //  all the non-static fields must have the same access control, otherwise the TensorEvaluator won't be standard layout;
   bool isCopy, nByOne, oneByN;
   public:
   typedef StorageMemory<CoeffReturnType, Device> Storage;
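The requirement spelled out in this comment is a core-language rule: a class whose non-static data members live under different access specifiers is not standard-layout. A minimal illustration:

    #include <type_traits>

    struct AllProtected {
     protected:
      bool a;
      bool b;
    };

    struct MixedAccess {
     protected:
      bool a;
     public:
      bool b;   // different access control than 'a'
    };

    static_assert(std::is_standard_layout<AllProtected>::value,
                  "all members share one access control: standard layout");
    static_assert(!std::is_standard_layout<MixedAccess>::value,
                  "mixed access control: no longer standard layout");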
@@ -112,7 +112,7 @@ struct TTPanelSize {
   // BC : determines if supporting bank conflict is required
   static EIGEN_CONSTEXPR bool BC = true;
   // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by
-  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient  local memory)
+  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory)
   static EIGEN_CONSTEXPR bool DoubleBuffer =
 #ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
       false;
@@ -430,7 +430,7 @@ struct ThreadProperties {
  Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny
  contraction is used. So in this case, a final reduction step is required to compute final output.
 
- * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of
+ * \tparam contraction_tp: it is an enum value representing whether the local memory/no local memory implementation of
  the algorithm to be used
  *
  * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
@@ -495,7 +495,7 @@ class TensorContractionKernel {
    * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out
    * different type of memory needed when local/no_local memory computation is called.
    *
-   * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation
+   * \tparam contraction_type: it is an enum value representing whether the local memory/no local memory implementation
    of the algorithm to be used
    * \tparam the private memory size
    * \param ptr the tile memory pointer type
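Double buffering, as mentioned in the TTPanelSize comment above, means staging the next tile into a second buffer while the current tile is being consumed. A toy host-side illustration of the pattern (the helpers and sizes are made up; this is not the SYCL kernel itself):

    #include <cstdio>
    #include <vector>

    int main() {
      const int num_tiles = 4, tile_size = 8;
      std::vector<float> buffer[2] = {std::vector<float>(tile_size), std::vector<float>(tile_size)};
      auto load_tile = [&](int k, std::vector<float>& dst) { dst.assign(tile_size, float(k)); };
      auto consume   = [&](const std::vector<float>& src) { std::printf("consuming tile %g\n", src[0]); };

      load_tile(0, buffer[0]);                                  // prime the first buffer
      for (int k = 0; k < num_tiles; ++k) {
        const int cur = k % 2, nxt = 1 - cur;
        if (k + 1 < num_tiles) load_tile(k + 1, buffer[nxt]);   // stage the next tile
        consume(buffer[cur]);                                   // compute on the current tile
      }
      return 0;
    }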
@@ -897,7 +897,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         } else {
           // If we can't guarantee that all kernels in `k` slice will be
           // executed sequentially in current thread, it's no longer safe to use
-          // thread local memory in followig slices along the k dimensions.
+          // thread local memory in following slices along the k dimensions.
           eigen_assert(k > 0);
           can_use_thread_local_packed_[n].store(false,
                                                 std::memory_order_relaxed);
@@ -715,7 +715,7 @@ class QueueInterface {
   EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
 
   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
-    // OpenCL doesnot have such concept
+    // OpenCL does not have such a concept
     return 2;
   }
 
@@ -1035,7 +1035,7 @@ struct SyclDevice : public SyclDeviceBase {
     return queue_stream()->maxWorkItemSizes();
   }
   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
-    // OpenCL doesnot have such concept
+    // OpenCL does not have such a concept
     return queue_stream()->maxSyclThreadsPerMultiProcessor();
   }
   EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
@@ -133,7 +133,7 @@ template <typename T> class UniformRandomGenerator {
     m_state = PCG_XSH_RS_state(seed);
     #ifdef EIGEN_USE_SYCL
     // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
-    // Therefor, we need two step to initializate the m_state.
+    // Therefore, we need two steps to initializate the m_state.
     // IN SYCL, the constructor of the functor is s called on the CPU
     // and we get the clock seed here from the CPU. However, This seed is
     //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
@@ -246,7 +246,7 @@ template <typename T> class NormalRandomGenerator {
     m_state = PCG_XSH_RS_state(seed);
     #ifdef EIGEN_USE_SYCL
     // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
-    // Therefor, we need two steps to initializate the m_state.
+    // Therefore, we need two steps to initializate the m_state.
     // IN SYCL, the constructor of the functor is s called on the CPU
     // and we get the clock seed here from the CPU. However, This seed is
     //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
@@ -25,7 +25,7 @@
  * buffer is given as an input and all the threads within a work-group scan and
  * reduces the boundaries between the blocks (generated from the previous
  * kernel). and write the data on the temporary buffer. If the second kernel is
- * required, the third and final kerenl (ScanAdjustmentKernelFunctor) will
+ * required, the third and final kernel (ScanAdjustmentKernelFunctor) will
  * adjust the final result into the output buffer.
  * The original algorithm for the parallel prefix sum can be found here:
  *
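The three kernels described here form the classic block-wise prefix sum: scan each block, scan the per-block totals, then add each block's offset back. A sequential sketch of those three phases (names and types are illustrative; each loop corresponds to one of the kernels):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void block_inclusive_scan(std::vector<float>& data, std::size_t block_size) {
      const std::size_t n = data.size();
      const std::size_t num_blocks = (n + block_size - 1) / block_size;
      std::vector<float> block_sums(num_blocks, 0.f);

      // Phase 1: scan each block independently and record its total.
      for (std::size_t b = 0; b < num_blocks; ++b) {
        float running = 0.f;
        for (std::size_t i = b * block_size; i < std::min(n, (b + 1) * block_size); ++i) {
          running += data[i];
          data[i] = running;
        }
        block_sums[b] = running;
      }

      // Phase 2: exclusive scan over the per-block totals (the "boundaries between the blocks").
      float offset = 0.f;
      for (std::size_t b = 0; b < num_blocks; ++b) {
        const float total = block_sums[b];
        block_sums[b] = offset;
        offset += total;
      }

      // Phase 3: adjust every element by its block's offset (the ScanAdjustmentKernelFunctor step).
      for (std::size_t b = 0; b < num_blocks; ++b) {
        for (std::size_t i = b * block_size; i < std::min(n, (b + 1) * block_size); ++i) {
          data[i] += block_sums[b];
        }
      }
    }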
@@ -788,7 +788,7 @@ struct igammac_cf_impl {
     Scalar ax = main_igamma_term<Scalar>(a, x);
     // This is independent of mode. If this value is zero,
     // then the function value is zero. If the function value is zero,
-    // then we are in a neighborhood where the function value evalutes to zero,
+    // then we are in a neighborhood where the function value evaluates to zero,
     // so the derivative is zero.
     if (ax == zero) {
       return zero;
@@ -899,7 +899,7 @@ struct igamma_series_impl {
 
     // This is independent of mode. If this value is zero,
     // then the function value is zero. If the function value is zero,
-    // then we are in a neighborhood where the function value evalutes to zero,
+    // then we are in a neighborhood where the function value evaluates to zero,
     // so the derivative is zero.
     if (ax == zero) {
       return zero;
@@ -38,24 +38,24 @@ template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
 }
 }
 
-struct EqualAssignement {
+struct EqualAssignment {
   template <typename Lhs, typename Rhs>
   void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
 };
 
-struct PlusEqualAssignement {
+struct PlusEqualAssignment {
   template <typename Lhs, typename Rhs>
   void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; }
 };
 
 template <typename DataType, int DataLayout,
-          typename Assignement, typename Operator>
+          typename Assignment, typename Operator>
 void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
                                     const array<int64_t, 3>& tensor_range) {
   Operator op;
-  Assignement asgn;
+  Assignment asgn;
   {
-    /* Assignement(out, Operator(in)) */
+    /* Assignment(out, Operator(in)) */
     Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
     Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
     in = in.random() + DataType(0.01);
@@ -84,7 +84,7 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
     sycl_device.deallocate(gpu_data_out);
   }
   {
-    /* Assignement(out, Operator(out)) */
+    /* Assignment(out, Operator(out)) */
     Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
     out = out.random() + DataType(0.01);
     Tensor<DataType, 3, DataLayout, int64_t> reference(out);
@@ -137,11 +137,11 @@ DECLARE_UNARY_STRUCT(isnan)
 DECLARE_UNARY_STRUCT(isfinite)
 DECLARE_UNARY_STRUCT(isinf)
 
-template <typename DataType, int DataLayout, typename Assignement>
+template <typename DataType, int DataLayout, typename Assignment>
 void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device,
                                          const array<int64_t, 3>& tensor_range) {
 #define RUN_UNARY_TEST(FUNC) \
-  test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \
+  test_unary_builtins_for_scalar<DataType, DataLayout, Assignment, \
                                  op_##FUNC>(sycl_device, tensor_range)
   RUN_UNARY_TEST(abs);
   RUN_UNARY_TEST(sqrt);
@@ -190,9 +190,9 @@ template <typename DataType, int DataLayout>
 void test_unary_builtins(const Eigen::SyclDevice& sycl_device,
                          const array<int64_t, 3>& tensor_range) {
   test_unary_builtins_for_assignement<DataType, DataLayout,
-                                      PlusEqualAssignement>(sycl_device, tensor_range);
+                                      PlusEqualAssignment>(sycl_device, tensor_range);
   test_unary_builtins_for_assignement<DataType, DataLayout,
-                                      EqualAssignement>(sycl_device, tensor_range);
+                                      EqualAssignment>(sycl_device, tensor_range);
   test_unary_builtins_return_bool<DataType, DataLayout,
                                   op_isnan>(sycl_device, tensor_range);
   test_unary_builtins_return_bool<DataType, DataLayout,