WIP 2 - need to implement 2x1x1
This commit is contained in:
		
							parent
							
								
									029f78abf0
								
							
						
					
					
						commit
						6533187280
					
				| @ -74,6 +74,21 @@ namespace internal { | ||||
|     lhsPackMap.advance(4*1); \ | ||||
|     rhsPackMap.advance(1*4); | ||||
| 
 | ||||
// MICRO_2x1x4: statement sequence for one step of the 2x1x4 micro-kernel.
// It is a macro, so it relies on names declared at the expansion site:
// the LhsPacket/RhsPacket types, pLhs, pRhs, pRhs0..pRhs3, lhsPackMap,
// rhsPackMap and acc.
// Steps, in order:
//   1. load one LhsPacket from lhsPackMap.pCur;
//   2. load an RhsPacket from rhsPackMap.pCur and broadcast its lanes [0]
//      and [1] into pRhs0/pRhs1 with pset1;
//   3. reload pRhs from rhsPackMap.pCur + 2 and broadcast lanes [0] and [1]
//      into pRhs2/pRhs3 (pRhs is reused, so the order of these lines matters);
//   4. accumulate pLhs*pRhsN into acc._acc.packet[N] for N = 0..3;
//   5. advance the lhs cursor by 2*1 and the rhs cursor by 1*4.
| #define MICRO_2x1x4() \ | ||||
|     pLhs = pload<LhsPacket>(lhsPackMap.pCur); \ | ||||
|     pRhs = pload<RhsPacket>(rhsPackMap.pCur); \ | ||||
|     pRhs0 = pset1<RhsPacket>(pRhs[0]); \ | ||||
|     pRhs1 = pset1<RhsPacket>(pRhs[1]); \ | ||||
|     pRhs = pload<RhsPacket>(rhsPackMap.pCur + 2); \ | ||||
|     pRhs2 = pset1<RhsPacket>(pRhs[0]); \ | ||||
|     pRhs3 = pset1<RhsPacket>(pRhs[1]); \ | ||||
|     acc._acc.packet[0] += pLhs*pRhs0; \ | ||||
|     acc._acc.packet[1] += pLhs*pRhs1; \ | ||||
|     acc._acc.packet[2] += pLhs*pRhs2; \ | ||||
|     acc._acc.packet[3] += pLhs*pRhs3; \ | ||||
|     lhsPackMap.advance(2*1); \ | ||||
|     rhsPackMap.advance(1*4); | ||||
| 
 | ||||
| #define MICRO_12x1x1(K) \ | ||||
|   pLhs = pload<LhsPacket>(lhsPackMap.pCur +  (0 + 3*K)*4); \ | ||||
|   pLhs2 = pload<LhsPacket>(lhsPackMap.pCur + (1 + 3*K)*4); \ | ||||
| @ -116,6 +131,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1> | ||||
|     _acc.packet[2] = pset1<AccPacket>(0); | ||||
|   } | ||||
| 
 | ||||
// No-op prefetch hook: takes a DataMapper and two Index arguments, all
// unnamed and ignored; the body is empty. The three template parameters are
// not referenced here.
|   template<int LhsProgress, int DepthProgress, int RhsProgress> | ||||
|   EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} | ||||
| 
 | ||||
|   template<typename ResPacket_> | ||||
| @ -156,6 +172,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1> | ||||
|     _acc.packet[1] = pset1<AccPacket>(0); | ||||
|   } | ||||
| 
 | ||||
// No-op prefetch hook: the DataMapper and the two Index arguments are
// accepted but unused, and the function body is empty. The template
// parameters are not referenced here.
|   template<int LhsProgress, int DepthProgress, int RhsProgress> | ||||
|   EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} | ||||
| 
 | ||||
|   template<typename ResPacket_> | ||||
| @ -189,7 +206,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1> | ||||
|   { | ||||
|     _acc = pset1<AccPacket>(0); | ||||
|   } | ||||
|    | ||||
| 
 | ||||
// No-op prefetch hook: nothing is prefetched. All three arguments are unnamed
// and ignored, and the template parameters are not referenced here.
|   template<int LhsProgress, int DepthProgress, int RhsProgress> | ||||
|   EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} | ||||
| 
 | ||||
|   template<typename ResPacket_> | ||||
| @ -221,6 +239,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4> | ||||
|     _acc = pset1<AccPacket>(0); | ||||
|   } | ||||
| 
 | ||||
// No-op prefetch hook: empty body; the DataMapper and both Index arguments
// are ignored, as are the three template parameters.
|   template<int LhsProgress, int DepthProgress, int RhsProgress> | ||||
|   EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} | ||||
| 
 | ||||
|   template<typename ResPacket_> | ||||
| @ -237,6 +256,80 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4> | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| //[TODO] Implement this properly
 | ||||
// Accumulator specialization whose two trailing template arguments are 2 and 4.
// It keeps four AccPacket accumulators (AccPacket is the half packet of
// Scalar) plus four LinearMappers into the destination, one per column
// col+0 .. col+3.
// Call order matters: prefetch() is the only place r0..r3 are assigned, and
// store() reads and writes through them, so prefetch() has to run before
// store().
| template<int CPU, typename Scalar, typename ResScalar, typename DataMapper> | ||||
| struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 2, 4> | ||||
| { | ||||
|   using LinearMapper = typename DataMapper::LinearMapper; | ||||
|   using AccPacket = typename packet_traits<Scalar>::half; | ||||
|   using ResPacket = typename packet_traits<ResScalar>::type; | ||||
| 
 | ||||
  // Destination mappers, constructed from nullptr and set in prefetch().
|   LinearMapper r0{nullptr}; | ||||
|   LinearMapper r1{nullptr}; | ||||
|   LinearMapper r2{nullptr}; | ||||
|   LinearMapper r3{nullptr}; | ||||
| 
 | ||||
  // The four running accumulators; MICRO_2x1x4 updates them as acc._acc.packet[N].
|   PacketBlock<AccPacket, 4> _acc; | ||||
| 
 | ||||
  // Resets all four accumulator packets to zero.
|   EIGEN_STRONG_INLINE void zero() | ||||
|   { | ||||
|     _acc.packet[0] = pset1<AccPacket>(0); | ||||
|     _acc.packet[1] = pset1<AccPacket>(0); | ||||
|     _acc.packet[2] = pset1<AccPacket>(0); | ||||
|     _acc.packet[3] = pset1<AccPacket>(0); | ||||
|   } | ||||
| 
 | ||||
  // Binds r0..r3 to dest at (row, col+0) .. (row, col+3). The actual
  // prefetch(0) calls are compiled in only when __ENABLE_PREFETCH__ is defined.
  // The template parameters are not used in the body. The asm statements
  // contain only a '#'-prefixed label (presumably markers for locating this
  // code in the generated assembly - confirm).
|   template<int LhsProgress, int DepthProgress, int RhsProgress> | ||||
|   EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) | ||||
|   { | ||||
|     asm __volatile__("#BEGIN_PREFETCH_2x4\n\t"); | ||||
|     r0 = dest.getLinearMapper(row + 0, col + 0); | ||||
|     r1 = dest.getLinearMapper(row + 0, col + 1); | ||||
|     r2 = dest.getLinearMapper(row + 0, col + 2); | ||||
|     r3 = dest.getLinearMapper(row + 0, col + 3); | ||||
| 
 | ||||
| #ifdef __ENABLE_PREFETCH__ | ||||
|     r0.prefetch(0); | ||||
|     r1.prefetch(0); | ||||
|     r2.prefetch(0); | ||||
|     r3.prefetch(0); | ||||
| #endif | ||||
|     asm __volatile__("#END_PREFETCH_2x4\n\t"); | ||||
|   } | ||||
| 
 | ||||
  // Currently a no-op: every statement is commented out and both parameters
  // are unused. In this specialization alpha is applied inside store()
  // (ppAlpha*_acc.packet[N]); enabling the lines below without changing
  // store() would apply alpha twice.
|   template<typename ResPacket_> | ||||
|   EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) | ||||
|   { | ||||
|     // _acc.packet[0] *= pAlpha;
 | ||||
|     // _acc.packet[1] *= pAlpha;
 | ||||
|     // _acc.packet[2] *= pAlpha;
 | ||||
|     // _acc.packet[3] *= pAlpha;
 | ||||
|   } | ||||
| 
 | ||||
  // Read-modify-write of the destination: loads one AccPacket at offset 0
  // from each of r0..r3, adds alpha * accumulator, and stores the result back
  // at the same offset. alpha is broadcast into an AccPacket locally; dest,
  // row, col and pAlpha are not used in the body.
|   template<typename ResPacket_> | ||||
|   EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha) | ||||
|   { | ||||
|     asm __volatile__("#BEGIN_STORE_2x4\n\t"); | ||||
|     constexpr auto PacketSize = unpacket_traits<AccPacket>::size; | ||||
|     AccPacket ppAlpha = pset1<AccPacket>(alpha); | ||||
|     AccPacket R00 = r0.template loadPacket<AccPacket>(0*PacketSize); | ||||
|     AccPacket R01 = r1.template loadPacket<AccPacket>(0*PacketSize); | ||||
|     AccPacket R02 = r2.template loadPacket<AccPacket>(0*PacketSize); | ||||
|     AccPacket R03 = r3.template loadPacket<AccPacket>(0*PacketSize); | ||||
| 
 | ||||
|     R00 += ppAlpha*_acc.packet[0]; | ||||
|     R01 += ppAlpha*_acc.packet[1]; | ||||
|     R02 += ppAlpha*_acc.packet[2]; | ||||
|     R03 += ppAlpha*_acc.packet[3]; | ||||
| 
 | ||||
|     r0.storePacket(0*PacketSize, R00); | ||||
|     r1.storePacket(0*PacketSize, R01); | ||||
|     r2.storePacket(0*PacketSize, R02); | ||||
|     r3.storePacket(0*PacketSize, R03); | ||||
|     asm __volatile__("#END_STORE_2x4\n\t"); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template<int CPU, typename Scalar, typename ResScalar, typename DataMapper> | ||||
| struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> | ||||
| { | ||||
| @ -259,6 +352,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> | ||||
|     _acc.packet[3] = pset1<AccPacket>(0); | ||||
|   } | ||||
| 
 | ||||
|   template<int LhsProgress, int DepthProgress, int RhsProgress> | ||||
|   EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) | ||||
|   { | ||||
|     asm __volatile__("#BEGIN_PREFETCH_4x4\n\t"); | ||||
| @ -336,6 +430,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> | ||||
|     _acc2.packet[3] = pset1<AccPacket>(0); | ||||
|   } | ||||
| 
 | ||||
|   template<int LhsProgress, int DepthProgress, int RhsProgress> | ||||
|   EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) | ||||
|   { | ||||
|     constexpr Index offset = 32 / sizeof(ResScalar); | ||||
| @ -437,6 +532,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4> | ||||
|     _acc3.packet[3] = pset1<AccPacket>(0); | ||||
|   } | ||||
| 
 | ||||
|   template<int LhsProgress, int DepthProgress, int RhsProgress> | ||||
|   EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) | ||||
|   { | ||||
|     asm __volatile__("#BEGIN_PREFETCH_12x4\n\t"); | ||||
| @ -710,6 +806,28 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, | ||||
|   }; | ||||
| }; | ||||
| 
 | ||||
// MicroKernel specialization whose three trailing template arguments are
// 2, 1, 4. Its call operator declares the packet types and temporaries that
// MICRO_2x1x4 expects, then expands that macro once; the macro updates
// acc._acc and advances both pack-map cursors.
| template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator> | ||||
| struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 2, 1, 4> | ||||
| { | ||||
  // lhsPackMap / rhsPackMap: packed operand cursors, read at pCur and advanced
  //   by the macro.
  // rowIdx, colIdx, depthIdx: not used in this body.
  // acc: accumulator whose _acc.packet[0..3] the macro adds into.
|   EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,  | ||||
|                                       RhsPackMap& rhsPackMap,  | ||||
|                                       Index rowIdx, Index colIdx, Index depthIdx, | ||||
|                                       Accumulator& acc) | ||||
|   { | ||||
    // Both operands use the half-size packet type of their scalar.
|     using LhsPacket = typename packet_traits<LhsScalar>::half; | ||||
|     using RhsPacket = typename packet_traits<RhsScalar>::half; | ||||
| 
 | ||||
|     asm __volatile__("#BEGIN_NEON_MICROKERNEL_2x1x4\n\t"); | ||||
| 
 | ||||
    // Temporaries named exactly as MICRO_2x1x4 refers to them.
|     LhsPacket pLhs; | ||||
|     RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; | ||||
| 
 | ||||
|     MICRO_2x1x4(); | ||||
| 
 | ||||
|     asm __volatile__("#END_NEON_MICROKERNEL_2x1x4\n\t"); | ||||
|   }; | ||||
| }; | ||||
| 
 | ||||
| template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator> | ||||
| struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 1> | ||||
| { | ||||
|  | ||||
| @ -23,7 +23,7 @@ namespace internal { | ||||
| #endif | ||||
| 
 | ||||
| template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar> | ||||
| constexpr int SHAPES_COUNT = 14; | ||||
| constexpr int SHAPES_COUNT = 16; | ||||
| 
 | ||||
| constexpr int SHAPES_DIMENSION = 6; | ||||
| constexpr int SHAPES_LHS_DIMENSION = 0; | ||||
| @ -44,23 +44,35 @@ constexpr int PACK_SHAPES_DIMENSION = 3; | ||||
| constexpr int PACK_SHAPES_POINTER = 2; | ||||
| constexpr int PACK_SHAPES_END = -1; | ||||
| 
 | ||||
// Compile-time element counts of reduced-width packets for Scalar.
// `half` is the size of packet_traits<Scalar>::half; the SHAPES table uses
// PacketMultiples<RhsScalar>::half as an lhs progress value.
| template<typename Scalar> | ||||
| struct PacketMultiples | ||||
| { | ||||
|   enum | ||||
|   { | ||||
|     half = unpacket_traits<typename packet_traits<Scalar>::half>::size, | ||||
    // NOTE(review): `quarter` is defined with the same expression as `half`,
    // so the two values are equal; confirm whether a smaller packet size was
    // intended or whether this enumerator can be removed.
|     quarter = unpacket_traits<typename packet_traits<Scalar>::half>::size // Is this used?
 | ||||
|   }; | ||||
| }; | ||||
| 
 | ||||
| // lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
 | ||||
| template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar> | ||||
| constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =  | ||||
|   { /* 00 */{                               1,         1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, | ||||
|     /* 01 */{1*packet_traits<RhsScalar>::size,         1,1,                 0,                  0, SHAPES_POINTER_END}, | ||||
|     /* 02 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1,                 0,                  0,                  1}, | ||||
|     /* 03 */{2*packet_traits<RhsScalar>::size,         1,1,                 0,                  2, SHAPES_POINTER_END}, | ||||
|     /* 04 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1,                 0,                  2,                  3}, | ||||
|     /* 05 */{3*packet_traits<RhsScalar>::size,         1,1,                 0,                  4, SHAPES_POINTER_END}, | ||||
|     /* 06 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1,                 0,                  4,                  5}, | ||||
|     /* 07 */{                               1,         1,4,                 6, SHAPES_POINTER_END, SHAPES_POINTER_END}, | ||||
|     /* 08 */{1*packet_traits<RhsScalar>::size,         1,4,                 6,                  7, SHAPES_POINTER_END}, | ||||
|     /* 09 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4,                 6,                  7,                  8}, | ||||
|     /* 10 */{2*packet_traits<RhsScalar>::size,         1,4,                 6,                  9, SHAPES_POINTER_END}, | ||||
|     /* 11 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4,                 6,                  9,                 10}, | ||||
|     /* 12 */{3*packet_traits<RhsScalar>::size,         1,4,                 6,                 11, SHAPES_POINTER_END}, | ||||
|     /* 13 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4,                 6,                 11,                 12}}; | ||||
|     /* 01 */{PacketMultiples<RhsScalar>::half,         1,1,                 0,                  0, SHAPES_POINTER_END}, | ||||
|     /* 02 */{1*packet_traits<RhsScalar>::size,         1,1,                 0,                  1, SHAPES_POINTER_END}, | ||||
|     /* 03 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1,                 0,                  1,                  2}, | ||||
|     /* 04 */{2*packet_traits<RhsScalar>::size,         1,1,                 0,                  3, SHAPES_POINTER_END}, | ||||
|     /* 05 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1,                 0,                  3,                  4}, | ||||
|     /* 06 */{3*packet_traits<RhsScalar>::size,         1,1,                 0,                  5, SHAPES_POINTER_END}, | ||||
|     /* 07 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1,                 0,                  5,                  6}, | ||||
|     /* 08 */{                               1,         1,4,                 7, SHAPES_POINTER_END, SHAPES_POINTER_END}, | ||||
|     /* 09 */{PacketMultiples<RhsScalar>::half,         1,4,                 7,                  8, SHAPES_POINTER_END}, | ||||
|     /* 10 */{1*packet_traits<RhsScalar>::size,         1,4,                 7,                  9, SHAPES_POINTER_END}, | ||||
|     /* 11 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4,                 7,                  9,                 10}, | ||||
|     /* 12 */{2*packet_traits<RhsScalar>::size,         1,4,                 7,                 11, SHAPES_POINTER_END}, | ||||
|     /* 13 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4,                 7,                 11,                 12}, | ||||
|     /* 14 */{3*packet_traits<RhsScalar>::size,         1,4,                 7,                 13, SHAPES_POINTER_END}, | ||||
|     /* 15 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4,                 7,                 13,                 14}}; | ||||
| 
 | ||||
| // d1progress x d2progress
 | ||||
| template<int Architecture, int CPU, typename Scalar, bool isLhs> | ||||
| @ -218,6 +230,8 @@ struct PackMap | ||||
// Makes the current cursor the new base: copies pCur into pBase.
|   EIGEN_STRONG_INLINE void updateBase() { pBase = pCur; } | ||||
// Repositions the cursor relative to the base: pCur becomes pBase plus the
// offset returned by pmc.getPosition(p1, d2Size).
|   EIGEN_STRONG_INLINE void moveTo(Index p1) { pCur = pBase + pmc.getPosition(p1, d2Size); } | ||||
// Moves the cursor forward by adding `progress` to pCur; pBase is unchanged.
|   EIGEN_STRONG_INLINE void advance(Index progress) { pCur += progress; } | ||||
| 
 | ||||
|   template<int D1Progress=-1, int D2Progress=-1> | ||||
|   EIGEN_STRONG_INLINE void prefetch(Index amnt) | ||||
|   { | ||||
| #ifdef __ENABLE_PREFETCH__ | ||||
| @ -242,6 +256,7 @@ struct Accumulator | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
// No-op prefetch hook: empty body, all three arguments unnamed and ignored.
// Each template parameter defaults to -1, so the function can be called with
// or without explicit template arguments.
|   template<int LhsProgress=-1, int DepthProgress=-1, int RhsProgress=-1> | ||||
|   EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} | ||||
| 
 | ||||
|   template<typename ResPacket> | ||||
| @ -321,9 +336,9 @@ struct DepthLoopStruct | ||||
|   EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res, | ||||
|                           Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap) | ||||
|   { | ||||
|     constexpr auto rhsProgress      = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION]; | ||||
|     constexpr auto lhsProgress      = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION]; | ||||
|     constexpr auto depthProgress    = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION]; | ||||
|     constexpr int rhsProgress      = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION]; | ||||
|     constexpr int lhsProgress      = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION]; | ||||
|     constexpr int depthProgress    = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION]; | ||||
| 
 | ||||
|     typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType; | ||||
| 
 | ||||
| @ -332,11 +347,10 @@ struct DepthLoopStruct | ||||
| 
 | ||||
|     acc.zero(); | ||||
| 
 | ||||
|     acc.prefetch(res, rowIdx, colIdx); | ||||
|     acc.template prefetch<lhsProgress, depthProgress, rhsProgress>(res, rowIdx, colIdx); | ||||
| 
 | ||||
|     lhsPackMap.prefetch(0); | ||||
|     if(rhsProgress > 1) | ||||
|       rhsPackMap.prefetch(0); | ||||
|     lhsPackMap.template prefetch<lhsProgress, depthProgress>(0); | ||||
|     rhsPackMap.template prefetch<rhsProgress, depthProgress>(0); | ||||
| 
 | ||||
|     for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress) | ||||
|     { | ||||
| @ -367,9 +381,10 @@ struct LhsLoopStruct | ||||
|     constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION]; | ||||
|     constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION]; | ||||
|     DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS; | ||||
|     rhsPackMap.resetCur(); | ||||
|     //rhsPackMap.resetCur();
 | ||||
|     for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress) | ||||
|     { | ||||
|       rhsPackMap.resetCur(); | ||||
|       //lhsPackMap.moveTo(rowIdx);
 | ||||
|       //rhsPackMap.moveTo(colIdx);
 | ||||
| 
 | ||||
|  | ||||
| @ -15,7 +15,8 @@ void set(MatrixXf& A, int m, int n, int id, int digits) | ||||
| int main(int argc, char* argv[]) | ||||
| { | ||||
| #ifdef __DEBUG__ | ||||
|     int m = 9, k = 9, n = 9, max = std::max(std::max(m,k),n); | ||||
|     int m = std::atoi(argv[1]), k = std::atoi(argv[1]), n = std::atoi(argv[1]); | ||||
|     int max = std::max(std::max(m,k),n); | ||||
|     MatrixXf A = MatrixXf::Zero(m, k); | ||||
|     MatrixXf B = MatrixXf::Zero(k, n); | ||||
|     MatrixXf C = MatrixXf::Zero(m, n); | ||||
| @ -24,7 +25,8 @@ int main(int argc, char* argv[]) | ||||
|     set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1); | ||||
|     set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1); | ||||
| 
 | ||||
|     C = A*B; | ||||
|     for(auto i = 0; i < 2; i++) | ||||
|         C = A*B; | ||||
| 
 | ||||
|     std::cout << A << std::endl; | ||||
|     std::cout << B << std::endl; | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Everton Constantino
						Everton Constantino