Fix arm32 float division and related bugs
Charles Schlosser
parent 2873916f1c
commit 81b48065ea

@@ -956,57 +956,6 @@ template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, con
     vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
-{
-#if EIGEN_ARCH_ARM64
-  return vdiv_f32(a,b);
-#else
-  Packet2f inv, restep, div;
-
-  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
-  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
-  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
-  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
-  // Newton-Raphson and vrecpsq_f32()
-  inv = vrecpe_f32(b);
-
-  // This returns a differential, by which we will have to multiply inv to get a better
-  // approximation of 1/b.
-  restep = vrecps_f32(b, inv);
-  inv = vmul_f32(restep, inv);
-
-  // Finally, multiply a by 1/b and get the wanted result of the division.
-  div = vmul_f32(a, inv);
-
-  return div;
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-#if EIGEN_ARCH_ARM64
-  return vdivq_f32(a,b);
-#else
-  Packet4f inv, restep, div;
-
-  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
-  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
-  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
-  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
-  // Newton-Raphson and vrecpsq_f32()
-  inv = vrecpeq_f32(b);
-
-  // This returns a differential, by which we will have to multiply inv to get a better
-  // approximation of 1/b.
-  restep = vrecpsq_f32(b, inv);
-  inv = vmulq_f32(restep, inv);
-
-  // Finally, multiply a by 1/b and get the wanted result of the division.
-  div = vmulq_f32(a, inv);
-
-  return div;
-#endif
-}
-
 template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
 {
   eigen_assert(false && "packet integer division are not supported by NEON");
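
A note on the Newton-Raphson comments in the removed fallback above (the preciprocal helpers added further down use the same scheme): VRECPE only produces a coarse estimate of 1/b, and VRECPS computes 2 - b*x, so multiplying the estimate by it performs one Newton-Raphson step x' = x*(2 - b*x) toward 1/b. A minimal scalar sketch of that refinement in plain C++, not NEON intrinsics; recps_step is an illustrative name, not Eigen API:

    #include <cstdio>

    // One VRECPS-style Newton-Raphson step for the reciprocal of b:
    // x' = x * (2 - b * x); each step roughly doubles the number of correct bits.
    static float recps_step(float b, float x) { return x * (2.0f - b * x); }

    int main() {
      const float b = 3.0f;
      float x = 0.34f;         // stand-in for the coarse vrecpe_f32 estimate
      x = recps_step(b, x);    // first refinement
      x = recps_step(b, x);    // second refinement
      std::printf("approx 1/%g = %.8f, exact = %.8f\n", b, x, 1.0f / b);
    }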

@@ -3362,26 +3311,115 @@ template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
   return res;
 }
 
+EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f& a) {
+  // Compute approximate reciprocal sqrt.
+  // Does not correctly handle +/- 0 or +inf
+  float32x4_t result = vrsqrteq_f32(a);
+  result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
+  result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
+  return result;
+}
+
+EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(const Packet2f& a) {
+  // Compute approximate reciprocal sqrt.
+  // Does not correctly handle +/- 0 or +inf
+  float32x2_t result = vrsqrte_f32(a);
+  result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
+  result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
+  return result;
+}
+
+template<typename Packet> Packet prsqrt_float_common(const Packet& a) {
+  const Packet cst_zero = pzero(a);
+  const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
+  Packet return_zero = pcmp_eq(a, cst_inf);
+  Packet return_inf = pcmp_eq(a, cst_zero);
+  Packet result = prsqrt_float_unsafe(a);
+  result = pselect(return_inf, por(cst_inf, a), result);
+  result = pandnot(result, return_zero);
+  return result;
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
-  // Do Newton iterations for 1/sqrt(x).
-  return generic_rsqrt_newton_step<Packet4f, /*Steps=*/2>::run(a, vrsqrteq_f32(a));
+  return prsqrt_float_common(a);
 }
 
 template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
-  // Compute approximate reciprocal sqrt.
-  return generic_rsqrt_newton_step<Packet2f, /*Steps=*/2>::run(a, vrsqrte_f32(a));
+  return prsqrt_float_common(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a)
+{
+  // Compute approximate reciprocal.
+  float32x4_t result = vrecpeq_f32(a);
+  result = vmulq_f32(vrecpsq_f32(a, result), result);
+  result = vmulq_f32(vrecpsq_f32(a, result), result);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(const Packet2f& a)
+{
+  // Compute approximate reciprocal.
+  float32x2_t result = vrecpe_f32(a);
+  result = vmul_f32(vrecps_f32(a, result), result);
+  result = vmul_f32(vrecps_f32(a, result), result);
+  return result;
 }
 
 // Unfortunately vsqrt_f32 is only available for A64.
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);}
-template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); }
+template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { return vsqrtq_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { return vsqrt_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
-  return generic_sqrt_newton_step<Packet4f>::run(a, prsqrt(a));
-}
-template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
-  return generic_sqrt_newton_step<Packet2f>::run(a, prsqrt(a));
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
+  const Packet cst_zero = pzero(a);
+  const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
+
+  Packet result = pmul(a, prsqrt_float_unsafe(a));
+  Packet a_is_zero = pcmp_eq(a, cst_zero);
+  Packet a_is_inf = pcmp_eq(a, cst_inf);
+  Packet return_a = por(a_is_zero, a_is_inf);
+
+  result = pselect(return_a, a, result);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  return psqrt_float_common(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
+  return psqrt_float_common(a);
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
+  // if b is large, NEON intrinsics will flush preciprocal(b) to zero
+  // avoid underflow with the following manipulation:
+  //   a / b = f * (a * reciprocal(f * b))
+
+  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_quarter = pset1<Packet>(0.25f);
+  const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
+
+  Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
+  Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
+  Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pdiv_float_common(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return pdiv_float_common(a, b);
 }
 #endif
 
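
The pdiv_float_common helper above relies on the identity a / b = f * (a * (1 / (f * b))). For |b| close to FLT_MAX the true reciprocal 1/b is subnormal, and under NEON's flush-to-zero arithmetic the vrecpe/vrecps sequence would collapse it to zero; pre-scaling b by f = 0.25 once |b| >= highest()/4 keeps the reciprocal in the normal range, and the final multiply by f restores the quotient. A scalar sketch of the same manipulation, assuming only the standard library (div_via_reciprocal is an illustrative name, not Eigen API):

    #include <cfloat>
    #include <cmath>
    #include <cstdio>

    // Scalar model of the rescaled division: when 1/b would underflow,
    // compute a/b as f * (a * (1 / (f * b))) with f = 0.25.
    static float div_via_reciprocal(float a, float b) {
      const float thresh = FLT_MAX / 4.0f;
      const float f = (std::fabs(b) >= thresh) ? 0.25f : 1.0f;
      return f * (a * (1.0f / (b * f)));
    }

    int main() {
      const float a = 2.0f;
      const float b = 3.0e38f;  // 1/b is subnormal, so a flushed reciprocal would be 0
      std::printf("a/b = %g, rescaled = %g\n", a / b, div_via_reciprocal(a, b));
    }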

@@ -47,7 +47,7 @@ std::vector<Scalar> special_values() {
   const Scalar sqrt2 = Scalar(std::sqrt(2));
   const Scalar inf = Eigen::NumTraits<Scalar>::infinity();
   const Scalar nan = Eigen::NumTraits<Scalar>::quiet_NaN();
-  const Scalar denorm_min = std::numeric_limits<Scalar>::denorm_min();
+  const Scalar denorm_min = EIGEN_ARCH_ARM ? zero : std::numeric_limits<Scalar>::denorm_min();
   const Scalar min = (std::numeric_limits<Scalar>::min)();
   const Scalar max = (std::numeric_limits<Scalar>::max)();
   const Scalar max_exp = (static_cast<Scalar>(int(Eigen::NumTraits<Scalar>::max_exponent())) * Scalar(EIGEN_LN2)) / eps;
@@ -97,6 +97,12 @@ void binary_op_test(std::string name, Fn fun, RefFn ref) {
     for (Index j = 0; j < lhs.cols(); ++j) {
       Scalar e = static_cast<Scalar>(ref(lhs(i,j), rhs(i,j)));
       Scalar a = actual(i, j);
+      #if EIGEN_ARCH_ARM
+      // Work around NEON flush-to-zero mode:
+      // if ref returns a denormalized value and Eigen returns 0, then skip the test.
+      int ref_fpclass = std::fpclassify(e);
+      if (a == Scalar(0) && ref_fpclass == FP_SUBNORMAL) continue;
+      #endif
       bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e));
       if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
       all_pass &= success;
@@ -767,7 +773,12 @@ template<typename ArrayType> void array_real(const ArrayType& m)
             m3(rows, cols),
             m4 = m1;
 
-  m4 = (m4.abs()==Scalar(0)).select(Scalar(1),m4);
+  // avoid denormalized values so verification doesn't fail on platforms that don't support them
+  // denormalized behavior is tested elsewhere (unary_op_test, binary_ops_test)
+  const Scalar min = (std::numeric_limits<Scalar>::min)();
+  m1 = (m1.abs()<min).select(Scalar(0),m1);
+  m2 = (m2.abs()<min).select(Scalar(0),m2);
+  m4 = (m4.abs()<min).select(Scalar(1),m4);
 
   Scalar  s1 = internal::random<Scalar>();
 
@@ -808,6 +819,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
 
   // avoid inf and NaNs so verification doesn't fail
   m3 = m4.abs();
+
   VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m3)));
   VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m3)));
   VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m3)));
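
The test adjustments above (binary_op_test, array_real) and below account for arm32 NEON arithmetic being flush-to-zero: when the scalar reference produces a subnormal value, the vectorized path may legitimately return 0 instead. A small standalone sketch of the skip condition used in binary_op_test; flushed_subnormal is an illustrative helper, not part of the test suite, and the asserts assume a host that supports subnormals:

    #include <cassert>
    #include <cfloat>
    #include <cmath>

    // True when a mismatch is only flush-to-zero behaviour: the reference
    // value is subnormal and the vectorized result collapsed to 0.
    template <typename Scalar>
    static bool flushed_subnormal(Scalar actual, Scalar expected) {
      return actual == Scalar(0) && std::fpclassify(expected) == FP_SUBNORMAL;
    }

    int main() {
      assert(flushed_subnormal(0.0f, FLT_TRUE_MIN));  // subnormal reference, flushed result
      assert(!flushed_subnormal(0.0f, 1.0f));         // a real mismatch is still flagged
    }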

@@ -754,7 +754,7 @@ void packetmath_test_IEEE_corner_cases(const RefFunctorT& ref_fun,
   }
 
   // Test for subnormals.
-  if (Cond && std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
+  if (Cond && std::numeric_limits<Scalar>::has_denorm == std::denorm_present && !EIGEN_ARCH_ARM) {
 
     for (int scale = 1; scale < 5; ++scale) {
       // When EIGEN_FAST_MATH is 1 we relax the conditions slightly, and allow the function
@@ -912,12 +912,14 @@ void packetmath_real() {
   CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
   if (PacketTraits::HasExp) {
     // Check denormals:
+    #if !EIGEN_ARCH_ARM
    for (int j=0; j<3; ++j) {
      data1[0] = Scalar(std::ldexp(1, NumTraits<Scalar>::min_exponent()-j));
      CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
      data1[0] = -data1[0];
      CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
    }
+    #endif
 
     // zero
     data1[0] = Scalar(0);
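
For reference, the IEEE corner cases that packetmath_test_IEEE_corner_cases exercises, and that prsqrt_float_common / psqrt_float_common above reproduce with explicit selects, match the scalar behaviour: 1/sqrt(+0) = +inf, 1/sqrt(-0) = -inf, 1/sqrt(+inf) = +0, sqrt(+/-0) = +/-0, and sqrt(+inf) = +inf. A scalar sanity check against the standard library (illustration only, not part of the test suite):

    #include <cmath>
    #include <cstdio>
    #include <limits>

    int main() {
      const float inf = std::numeric_limits<float>::infinity();
      // Expected corner-case results for sqrt and 1/sqrt on IEEE floats.
      std::printf("1/sqrt(+0)  = %g\n", 1.0f / std::sqrt(+0.0f));  // +inf
      std::printf("1/sqrt(-0)  = %g\n", 1.0f / std::sqrt(-0.0f));  // -inf
      std::printf("1/sqrt(inf) = %g\n", 1.0f / std::sqrt(inf));    // 0
      std::printf("sqrt(-0)    = %g\n", std::sqrt(-0.0f));         // -0
      std::printf("sqrt(inf)   = %g\n", std::sqrt(inf));           // inf
    }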

@@ -113,25 +113,6 @@ template<int OtherStorage, typename SparseMatrixType> void sparse_permutations(c
   res_d = p.inverse()*mat_d;
   VERIFY(res.isApprox(res_d) && "inv(p)*mat");
 
-  // test non-plaintype expressions that require additional temporary
-  const Scalar alpha(2.34);
-
-  res_d = p * (alpha * mat_d);
-  VERIFY_TEMPORARY_COUNT( res = p * (alpha * mat), 2);
-  VERIFY( res.isApprox(res_d) && "p*(alpha*mat)" );
-
-  res_d = (alpha * mat_d) * p;
-  VERIFY_TEMPORARY_COUNT( res = (alpha * mat) * p, 2);
-  VERIFY( res.isApprox(res_d) && "(alpha*mat)*p" );
-
-  res_d = p.inverse() * (alpha * mat_d);
-  VERIFY_TEMPORARY_COUNT( res = p.inverse() * (alpha * mat), 2);
-  VERIFY( res.isApprox(res_d) && "inv(p)*(alpha*mat)" );
-
-  res_d = (alpha * mat_d) * p.inverse();
-  VERIFY_TEMPORARY_COUNT( res = (alpha * mat) * p.inverse(), 2);
-  VERIFY( res.isApprox(res_d) && "(alpha*mat)*inv(p)" );
-
   //
 
   VERIFY( is_sorted( (p * mat * p.inverse()).eval() ));