From a798d076596343b89b18e44c8033d77d22e19892 Mon Sep 17 00:00:00 2001 From: Charles Schlosser Date: Thu, 3 Aug 2023 20:36:42 +0000 Subject: [PATCH] Fix tensor stridedlinearbuffercopy --- .../Eigen/CXX11/src/Tensor/TensorBlock.h | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 7e9248796..afa3d5b38 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -1054,28 +1054,28 @@ class StridedLinearBufferCopy { } return; } - - const IndexType vectorized_size = count - PacketSize; + + const IndexType vectorized_size = PacketSize * (count / PacketSize); IndexType i = 0; if (kind == StridedLinearBufferCopy::Kind::Linear) { // ******************************************************************** // // Linear copy from `src` to `dst`. - const IndexType unrolled_size = count - 4 * PacketSize; + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); eigen_assert(src_stride == 1 && dst_stride == 1); - for (; i <= unrolled_size; i += 4 * PacketSize) { + for (; i < unrolled_size; i += 4 * PacketSize) { for (int j = 0; j < 4; ++j) { Packet p = ploadu(src + i + j * PacketSize); pstoreu(dst + i + j * PacketSize, p); } } - for (; i <= vectorized_size; i += PacketSize) { + for (; i < vectorized_size; i += PacketSize) { Packet p = ploadu(src + i); pstoreu(dst + i, p); } if (HasHalfPacket) { - const IndexType vectorized_half_size = count - HalfPacketSize; - if (i <= vectorized_half_size) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { HalfPacket p = ploadu(src + i); pstoreu(dst + i, p); i += HalfPacketSize; @@ -1088,13 +1088,13 @@ class StridedLinearBufferCopy { } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { // Scatter from `src` to `dst`. eigen_assert(src_stride == 1 && dst_stride != 1); - for (; i <= vectorized_size; i += PacketSize) { + for (; i < vectorized_size; i += PacketSize) { Packet p = ploadu(src + i); pscatter(dst + i * dst_stride, p, dst_stride); } if (HasHalfPacket) { - const IndexType vectorized_half_size = count - HalfPacketSize; - if (i <= vectorized_half_size) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { HalfPacket p = ploadu(src + i); pscatter(dst + i * dst_stride, p, dst_stride); i += HalfPacketSize; @@ -1107,20 +1107,21 @@ class StridedLinearBufferCopy { } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { // Fill `dst` with value at `*src`. eigen_assert(src_stride == 0 && dst_stride == 1); - const IndexType unrolled_size = count - 4 * PacketSize; + + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); Scalar s = *src; Packet p = pset1(s); - for (; i <= unrolled_size; i += 4 * PacketSize) { + for (; i < unrolled_size; i += 4 * PacketSize) { for (int j = 0; j < 4; ++j) { pstoreu(dst + i + j * PacketSize, p); } } - for (; i <= vectorized_size; i += PacketSize) { + for (; i < vectorized_size; i += PacketSize) { pstoreu(dst + i, p); } if (HasHalfPacket) { - const IndexType vectorized_half_size = count - HalfPacketSize; - if (i <= vectorized_half_size) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { HalfPacket hp = pset1(s); pstoreu(dst + i, hp); i += HalfPacketSize; @@ -1135,12 +1136,12 @@ class StridedLinearBufferCopy { eigen_assert(src_stride == 0 && dst_stride != 1); Scalar s = *src; Packet p = pset1(s); - for (; i <= vectorized_size; i += PacketSize) { + for (; i < vectorized_size; i += PacketSize) { pscatter(dst + i * dst_stride, p, dst_stride); } if (HasHalfPacket) { - const IndexType vectorized_half_size = count - HalfPacketSize; - if (i <= vectorized_half_size) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { HalfPacket hp = pset1(s); pscatter(dst + i * dst_stride, hp, dst_stride); i += HalfPacketSize; @@ -1153,13 +1154,13 @@ class StridedLinearBufferCopy { } else if (kind == StridedLinearBufferCopy::Kind::Gather) { // Gather from `src` into `dst`. eigen_assert(dst_stride == 1); - for (; i <= vectorized_size; i += PacketSize) { + for (; i < vectorized_size; i += PacketSize) { Packet p = pgather(src + i * src_stride, src_stride); pstoreu(dst + i, p); } if (HasHalfPacket) { - const IndexType vectorized_half_size = count - HalfPacketSize; - if (i <= vectorized_half_size) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { HalfPacket p = pgather(src + i * src_stride, src_stride); pstoreu(dst + i, p); @@ -1456,11 +1457,11 @@ class TensorBlockAssignment { IndexType eval_offset) { typedef typename packet_traits::type Packet; - const IndexType unrolled_size = count - 4 * PacketSize; - const IndexType vectorized_size = count - PacketSize; + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); + const IndexType vectorized_size = PacketSize * (count / PacketSize); IndexType i = 0; - for (; i <= unrolled_size; i += 4 * PacketSize) { + for (; i < unrolled_size; i += 4 * PacketSize) { for (int j = 0; j < 4; ++j) { const IndexType idx = eval_offset + i + j * PacketSize; Packet p = eval.template packet(idx); @@ -1468,7 +1469,7 @@ class TensorBlockAssignment { } } - for (; i <= vectorized_size; i += PacketSize) { + for (; i < vectorized_size; i += PacketSize) { Packet p = eval.template packet(eval_offset + i); pstoreu(target + i, p); }