Support multiple contraction kernel types in TensorContractionThreadPool
This commit is contained in:
parent 0a3356f4ec
commit 71cd3fbd6a

18  cmake/FindMkldnn.cmake  Normal file
@@ -0,0 +1,18 @@
# Intel mkl-dnn support.
# Link: https://github.com/intel/mkl-dnn
if (MKLDNN)
    set(MKLDNN_FIND_QUIETLY TRUE)
    set(MKLDNN_INCLUDES ${MKLDNN}/include)
    set(MKLDNN_LIBRARIES ${MKLDNN}/lib)
endif (MKLDNN)
find_path(MKLDNN
        NAMES
        mkldnn.h
        PATHS
        $ENV{MKLDNNDIR}/include
        ${INCLUDE_INSTALL_DIR}
        )
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MKLDNN DEFAULT_MSG
        MKLDNN)
mark_as_advanced(MKLDNN)
@@ -75,6 +75,10 @@ typedef unsigned __int64 uint64_t;
#include "libxsmm.h"
#endif

#if defined(EIGEN_USE_MKLDNN)
#include "mkldnn.h"
#endif

#ifdef EIGEN_USE_THREADS
#include "ThreadPool"
#endif
@@ -121,6 +125,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionMkldnn.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"

116  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h  Normal file
@@ -0,0 +1,116 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H

#if defined(EIGEN_USE_MKLDNN)
// Support for MklDnn sgemm kernel in Tensor contractions:
//
// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using
//    DataMapper (see TensorContractionInputMapper).
// 2. Invoke gemm kernel with packed blocks (replacement for default
//    gebp_kernel).

namespace Eigen {
namespace internal {

template <typename Scalar, typename StorageIndex, typename DataMapper,
          int StorageOrder>
struct mkldnn_gemm_pack;

// mkldnn_gemm_pack for ColMajor storage order.
template <typename Scalar, typename StorageIndex, typename DataMapper>
struct mkldnn_gemm_pack<Scalar, StorageIndex, DataMapper,
                        /*StorageOrder*/ ColMajor> {
  typedef typename internal::packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;

  enum { PacketSize = internal::packet_traits<Scalar>::size };

  EIGEN_DONT_INLINE
  void operator()(Scalar *block, const DataMapper &data_mapper,
                  StorageIndex rows, StorageIndex cols) {
    const StorageIndex unrolled_rows =
        (rows / (4 * PacketSize)) * (4 * PacketSize);
    const StorageIndex vectorized_rows = (rows / PacketSize) * PacketSize;

    for (StorageIndex col = 0; col < cols; ++col) {
      LinearMapper lm = data_mapper.getLinearMapper(0, col);

      // Give the compiler a strong hint to unroll the loop.
      for (StorageIndex i = 0; i < unrolled_rows; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; ++j) {
          const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
          internal::pstoreu(block + j * PacketSize, p);
        }
        block += 4 * PacketSize;
      }

      // Process remaining rows with packets.
      for (StorageIndex i = unrolled_rows; i < vectorized_rows;
           i += PacketSize) {
        const Packet p = lm.template loadPacket<Packet>(i);
        internal::pstoreu(block, p);
        block += PacketSize;
      }

      // Finalize with coefficients.
      for (StorageIndex i = vectorized_rows; i < rows; ++i) {
        *block = lm(i);
        ++block;
      }
    }
  }
};
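To make the packing contract concrete, here is a minimal standalone sketch (not part of the commit; the array sizes and indices are illustrative, and it assumes a translation unit compiled with EIGEN_USE_MKLDNN): packing copies a rows x cols sub-block, viewed through a DataMapper, into a contiguous column-major buffer.

#include <Eigen/CXX11/Tensor>

void pack_example() {
  using Eigen::Index;
  typedef Eigen::internal::blas_data_mapper<float, Index, Eigen::ColMajor>
      DataMapper;

  float src[4 * 5] = {};  // a 4x5 column-major matrix
  float block[2 * 3];     // destination for a 2x3 sub-block

  DataMapper mapper(src, /*stride=*/4);
  Eigen::internal::mkldnn_gemm_pack<float, Index, DataMapper, Eigen::ColMajor>
      pack;
  // Copies src(1..2, 2..4) into `block`, column by column, contiguously.
  pack(block, mapper.getSubMapper(/*i=*/1, /*j=*/2), /*rows=*/2, /*cols=*/3);
}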

template <typename Scalar, typename StorageIndex, typename OutputMapper,
          bool ConjugateLhs = false, bool ConjugateRhs = false>
struct mkldnn_gemm_kernel;

// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
template <typename StorageIndex, typename OutputMapper, bool ConjugateLhs,
          bool ConjugateRhs>
struct mkldnn_gemm_kernel</*Scalar*/ float, StorageIndex, OutputMapper,
                          ConjugateLhs, ConjugateRhs> {
  EIGEN_DONT_INLINE
  void operator()(const OutputMapper &output, const float *blockA,
                  const float *blockB, const StorageIndex rows,
                  const StorageIndex depth, const StorageIndex cols,
                  float alpha) {
    static const int max_index = (std::numeric_limits<int>::max)();

    eigen_assert(max_index > rows);
    eigen_assert(max_index > cols);
    eigen_assert(max_index > depth);
    eigen_assert(max_index > output.stride());

    const int m = static_cast<int>(rows);
    const int n = static_cast<int>(cols);
    const int k = static_cast<int>(depth);

    const char transposeA = ConjugateLhs ? 'Y' : 'N';
    const char transposeB = ConjugateRhs ? 'Y' : 'N';

    const int ldA = ConjugateLhs ? k : m;
    const int ldB = ConjugateRhs ? n : k;
    const int ldC = static_cast<int>(output.stride());

    const float beta = 1.0;

    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
                                      &alpha, blockA, &ldA, blockB, &ldB, &beta,
                                      const_cast<float*>(output.data()), &ldC);
    eigen_assert(st == 0);
  }
};
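Note that beta is hard-coded to 1.0, so the kernel computes C = alpha * A * B + C: it accumulates into the output across k-slices, and the output buffer must be zero-initialized before the first slice. A hedged usage sketch (the packed lhs_block/rhs_block and the sizes are assumed to come from the packer above):

void gemm_example(const float* lhs_block, const float* rhs_block,
                  int m, int n, int k) {
  typedef Eigen::internal::blas_data_mapper<float, Eigen::Index,
                                            Eigen::ColMajor> OutputMapper;

  Eigen::Tensor<float, 2> result(m, n);
  result.setZero();  // required: the kernel accumulates into `result`

  OutputMapper out(result.data(), /*stride=*/m);
  Eigen::internal::mkldnn_gemm_kernel<float, Eigen::Index, OutputMapper> kernel;
  // result += 1.0f * lhs_block * rhs_block
  kernel(out, lhs_block, rhs_block, /*rows=*/m, /*depth=*/k, /*cols=*/n,
         /*alpha=*/1.0f);
}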

}  // namespace internal
}  // namespace Eigen
#endif  // EIGEN_USE_MKLDNN
#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
@@ -15,6 +15,177 @@

namespace Eigen {

namespace internal {

// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
// ColMajor storage order. This property is guaranteed by the
// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
// multiplication for these blocks. The default tensor contraction uses
// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
// GeneralBlockPanelKernel.h for details).
//
// By specializing contraction kernels we can use other low level libraries to
// perform matrix multiplication, and still rely on the Eigen thread pool
// evaluator for scaling. The assumption is that a custom gemm does not use
// its own threading for parallelization.
//
// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
//   multiplication, the lhs tensor and the rhs tensor respectively.
//
// - StorageIndex - index type for the tensor expressions. In practice it is
//   almost always Eigen::Index.
//
// - OutputMapper provides access to the memory of the output matrix. In
//   practice it's always a column major blas_data_mapper (it must be of
//   ResScalar type).
//
// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a two
//   dimensional view into the Lhs/Rhs tensor expressions. In practice it's
//   TensorContractionInputMapper, or some specialization of it based on the
//   type of tensor expression (e.g. TensorImagePatchOp has an optimized input
//   mapper).
//
// TODO(ezhulenev): Use TensorContractionKernel in the default tensor
// contraction evaluator.
template<typename ResScalar, typename LhsScalar, typename RhsScalar,
    typename StorageIndex, typename OutputMapper, typename LhsMapper,
    typename RhsMapper>
struct TensorContractionKernel {
  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;

  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
                                  typename LhsMapper::SubMapper,
                                  Traits::mr, Traits::LhsProgress,
                                  typename Traits::LhsPacket4Packing, ColMajor>
      LhsPacker;

  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
                                  typename RhsMapper::SubMapper, Traits::nr,
                                  ColMajor>
      RhsPacker;

  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
                                OutputMapper, Traits::mr, Traits::nr,
      /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
      GebpKernel;

  EIGEN_DONT_INLINE
  static void packLhs(LhsScalar* lhsBlock,
                      const typename LhsMapper::SubMapper& data_mapper,
                      const StorageIndex depth, const StorageIndex rows) {
    LhsPacker()(lhsBlock, data_mapper, depth, rows);
  }

  EIGEN_DONT_INLINE
  static void packRhs(RhsScalar* rhsBlock,
                      const typename RhsMapper::SubMapper& data_mapper,
                      const StorageIndex depth, const StorageIndex cols) {
    RhsPacker()(rhsBlock, data_mapper, depth, cols);
  }

  EIGEN_DONT_INLINE
  static void invoke(const OutputMapper& output_mapper,
                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
                     const StorageIndex rows, const StorageIndex depth,
                     const StorageIndex cols, const ResScalar alpha) {
    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
        /*strideA*/ -1, /*strideB*/ -1,
        /*offsetA*/ 0, /*offsetB*/ 0);
  }
};
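Taken together, packLhs/packRhs/invoke define the per-block pipeline that the thread pool evaluator drives below (see pack_lhs, pack_rhs and kernel). A simplified sketch of one (m1, n1, k) task; every name here is a hypothetical stand-in for the corresponding Context member further down in this file:

template <typename Kernel, typename LhsMapper, typename RhsMapper,
          typename OutputMapper, typename Scalar, typename Index>
void contract_block(const LhsMapper& lhs, const RhsMapper& rhs,
                    const OutputMapper& output, Scalar* lhs_block,
                    Scalar* rhs_block, Index m1, Index n1, Index k,
                    Index bm, Index bn, Index bk) {
  // Pack a (bm x bk) block of Lhs and a (bk x bn) block of Rhs...
  Kernel::packLhs(lhs_block, lhs.getSubMapper(m1 * bm, k * bk), bk, bm);
  Kernel::packRhs(rhs_block, rhs.getSubMapper(k * bk, n1 * bn), bk, bn);
  // ...then multiply them into the (m1, n1) output block, accumulating over k.
  Kernel::invoke(output.getSubMapper(m1 * bm, n1 * bn), lhs_block, rhs_block,
                 bm, bk, bn, Scalar(1));
}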

// Some tensor contraction kernels might rely on gemm libraries that are
// optimized for specific dimension sizes. By default Eigen picks block sizes
// to fit the working set in the L1/L2 caches; by specializing we can refine
// this choice and round these sizes up to work well with the underlying gemm
// library.
// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate?
template<typename ResScalar, typename LhsScalar, typename RhsScalar,
    typename StorageIndex>
struct TensorContractionKernelBlocking {
  static void refine(const StorageIndex /*m*/,
                     const StorageIndex /*n*/,
                     const StorageIndex /*k*/,
                     StorageIndex* /*bm*/,
                     StorageIndex* /*bn*/,
                     StorageIndex* /*bk*/) {
    // By default we do nothing and stick to the block sizes picked by Eigen.
  }
};

#if defined(EIGEN_USE_MKLDNN)
// If all scalar types in the tensor contraction are floats, we can use mkldnn
// gemm as our low level kernel.
template<typename StorageIndex, typename OutputMapper, typename LhsMapper,
    typename RhsMapper>
struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
                               LhsMapper, RhsMapper> {
  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
  typedef float Scalar;

  typedef typename internal::gebp_traits<Scalar, Scalar> Traits;

  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
                                     typename LhsMapper::SubMapper, ColMajor>
      LhsPacker;

  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
                                     typename RhsMapper::SubMapper, ColMajor>
      RhsPacker;

  typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
      GemmKernel;

  EIGEN_DONT_INLINE
  static void packLhs(Scalar* lhsBlock,
                      const typename LhsMapper::SubMapper& data_mapper,
                      StorageIndex depth, StorageIndex rows) {
    // Note the (rows, depth) argument order: mkldnn_gemm_pack is a plain
    // column-major copy, and the Lhs block is a (rows x depth) matrix.
    LhsPacker()(lhsBlock, data_mapper, rows, depth);
  }

  EIGEN_DONT_INLINE
  static void packRhs(Scalar* rhsBlock,
                      const typename RhsMapper::SubMapper& data_mapper,
                      const StorageIndex depth, const StorageIndex cols) {
    RhsPacker()(rhsBlock, data_mapper, depth, cols);
  }

  EIGEN_DONT_INLINE
  static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
                     const Scalar* rhsBlock, const StorageIndex rows,
                     const StorageIndex depth, const StorageIndex cols,
                     const Scalar alpha) {
    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
  }
};

// For mkldnn_sgemm, having the right dimensions (especially for small
// matrices) is more important than fitting all of the working set in the
// L1/L2 caches.
template<typename StorageIndex>
struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
  // Mkldnn Avx/Avx2/Avx512 unroll factors are 8/16/48. We pick the largest.
  static const StorageIndex kUnrollM = 48;
  // Mkldnn Avx/Avx2/Avx512 unroll factors are 6/6/8. We pick the closest
  // number divisible by both.
  static const StorageIndex kUnrollN = 24;

  static void refine(const StorageIndex m,
                     const StorageIndex n,
                     const StorageIndex /*k*/,
                     StorageIndex* bm,
                     StorageIndex* bn,
                     StorageIndex* /*bk*/) {
    // TODO(ezhulenev): There is probably a better way to pick block sizes.
    *bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
    *bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
    // Stick with the default bk.
  }
};
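A worked example of the rounding (the numbers are illustrative, not from the commit):

// Suppose m = 200 and Eigen's cache-based heuristic picked bm = 100. Then
//   divup(100, kUnrollM) * kUnrollM = divup(100, 48) * 48 = 3 * 48 = 144,
// so refine() sets *bm = min(200, 144) = 144, a multiple of the Avx512
// unroll factor. bn is rounded up to a multiple of kUnrollN = 24 in the
// same way.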

#endif  // EIGEN_USE_MKLDNN
}  // namespace internal

template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
@@ -175,6 +346,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
      bn = blocking.nc();
      bk = blocking.kc();
    }
    // Refine blocking choice to work well with contraction kernel.
    internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar,
                                              Index>::refine(m, n, k, &bm,
                                                             &bn, &bk);

    // Number of kernels for each dimension.
    Index nm0 = divup(m, bm);
@@ -242,17 +417,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
        contract_t, internal::packet_traits<RhsScalar>::size,
        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
        RhsMapper;
-    typedef internal::gemm_pack_lhs<LhsScalar, Index,
-                                    typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
-        LhsPacker;
-    typedef internal::gemm_pack_rhs<
-        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
-        RhsPacker;

    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false>
-        GebpKernel;

+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;

    Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
            Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
@@ -434,8 +604,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
    void pack_lhs(Index m, Index k) {
      const Index mend = m * gm_ + gm(m);
      for (Index m1 = m * gm_; m1 < mend; m1++)
-        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
-                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+        TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1],
+                                         lhs_.getSubMapper(m1 * bm_, k * bk_),
+                                         bk(k), bm(m1));

      if (!parallel_pack_ && shard_by_col_) {
        signal_packing(k);
@@ -458,8 +629,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
          // deadlocks.
          memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
        }
-        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
-                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+        TensorContractionKernel::packRhs(packed_rhs_[k % (P - 1)][n1],
+                                         rhs_.getSubMapper(k * bk_, n1 * bn_),
+                                         bk(k), bn(n1));
      }

      if (parallel_pack_ || shard_by_col_) {
@@ -480,9 +652,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
        for (Index n1 = n * gn_; n1 < nend; n1++) {
          for (Index m1 = m * gm_; m1 < mend; m1++) {
            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
-            GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+            TensorContractionKernel::invoke(
+                output_mapper, packed_lhs_[k % (P - 1)][m1],
+                packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));

            // We are done with the last task for the [m1, n1] block.
            if (k + 1 == nk_) {
@@ -495,9 +667,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
        for (Index m1 = m * gm_; m1 < mend; m1++)
          for (Index n1 = n * gn_; n1 < nend; n1++) {
            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
-            GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+            TensorContractionKernel::invoke(
+                output_mapper, packed_lhs_[k % (P - 1)][m1],
+                packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));

            // We are done with the last task for the [m1, n1] block.
            if (k + 1 == nk_) {
@@ -23,6 +23,17 @@
else(XSMM_FOUND)
  ei_add_property(EIGEN_MISSING_BACKENDS  "Xsmm, ")
endif(XSMM_FOUND)

find_package(Mkldnn)
if(MKLDNN_FOUND)
  add_definitions("-DEIGEN_USE_MKLDNN")
  include_directories(${MKLDNN_INCLUDES})
  link_directories(${MKLDNN_LIBRARIES})
  set(EXTERNAL_LIBS ${EXTERNAL_LIBS} mkldnn)
  ei_add_property(EIGEN_TESTED_BACKENDS  "Mkldnn, ")
else(MKLDNN_FOUND)
  ei_add_property(EIGEN_MISSING_BACKENDS  "Mkldnn, ")
endif(MKLDNN_FOUND)

find_package(GoogleHash)
if(GOOGLEHASH_FOUND)
  add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
@@ -190,6 +201,7 @@ if(EIGEN_TEST_CXX11)
  ei_add_test(cxx11_tensor_index_list)
  ei_add_test(cxx11_tensor_mixed_indices)
  ei_add_test(cxx11_tensor_contraction)
  ei_add_test(cxx11_tensor_contraction_mkldnn)
  ei_add_test(cxx11_tensor_convolution)
  ei_add_test(cxx11_tensor_expr)
  ei_add_test(cxx11_tensor_fixed_size)

141  unsupported/test/cxx11_tensor_contraction_mkldnn.cpp  Normal file
@@ -0,0 +1,141 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include "main.h"

// Nothing to test here if we do not have mkldnn enabled.
#if defined(EIGEN_USE_MKLDNN)

#include <Eigen/CXX11/Tensor>

using Eigen::array;
using Eigen::ColMajor;
using Eigen::Tensor;
using Eigen::Index;
using Eigen::internal::blas_data_mapper;
using Eigen::internal::mkldnn_gemm_kernel;
using Eigen::internal::mkldnn_gemm_pack;

template <int NumDims>
static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
  array<Index, NumDims> dims;
  for (int i = 0; i < NumDims; ++i) {
    dims[i] = internal::random<int>(min_dim, max_dim);
  }
  return dims;
}

// Packing with mkldnn_gemm_pack is the same as taking a slice of a
// 2-dimensional Tensor.
template <typename Scalar>
static void test_mkldnn_gemm_pack() {
  static const int Options = 0 | ColMajor;

  typedef blas_data_mapper<Scalar, Index, ColMajor> DataMapper;
  typedef mkldnn_gemm_pack<Scalar, Index, DataMapper, ColMajor> MkldnnGemmPack;
  typedef Tensor<Scalar, 2, Options, Index> Tensor2d;

  array<Index, 2> dims = RandomDims<2>(1, 500);

  // Create a tensor initialized with random data.
  Tensor2d src(dims);
  src.setRandom();

  // Pick a random slice of the src tensor.
  array<Index, 2> slice_start = RandomDims<2>(0, 250);
  array<Index, 2> slice_size = RandomDims<2>(100, 500);
  // Make sure that slice start + size does not overflow tensor dims.
  for (int i = 0; i < 2; ++i) {
    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
  }

  // Prepare tensors for the packing and slicing results.
  Tensor2d pack_dst(slice_size[0], slice_size[1]);
  Tensor2d slice_dst(slice_size[0], slice_size[1]);

  // Pack memory using mkldnn_gemm_pack.
  DataMapper data_mapper(src.data(), dims[0]);
  MkldnnGemmPack gemm_pack;
  gemm_pack(pack_dst.data(),
            data_mapper.getSubMapper(slice_start[0], slice_start[1]),
            slice_size[0], slice_size[1]);
  // Slice the source tensor.
  slice_dst = src.slice(slice_start, slice_size);

  // Verify that the dst tensors are equal.
  VERIFY_IS_EQUAL(pack_dst.dimensions().TotalSize(),
                  slice_dst.dimensions().TotalSize());
  for (Index i = 0; i < pack_dst.dimensions().TotalSize(); ++i) {
    Scalar packed = pack_dst.coeff(i);
    Scalar sliced = slice_dst.coeff(i);
    VERIFY_IS_EQUAL(packed, sliced);
  }
}

template <typename Scalar>
static void test_mkldnn_gemm_kernel() {
  static const int Options = 0 | ColMajor;

  typedef Tensor<Scalar, 2, Options, Index> Tensor2d;

  int m = internal::random<int>(1, 100);
  int n = internal::random<int>(1, 100);
  int k = internal::random<int>(1, 100);

  Tensor2d lhs(m, k);
  lhs.setRandom();

  Tensor2d rhs(k, n);
  rhs.setRandom();

  // Compute the matmul with the mkldnn gemm kernel.
  typedef blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
  typedef mkldnn_gemm_kernel<Scalar, Index, OutputMapper, ColMajor>
      MkldnnGemmKernel;

  Tensor2d mkldnn_result(m, n);
  mkldnn_result.setZero();

  OutputMapper output_mapper(mkldnn_result.data(), m);
  MkldnnGemmKernel gemm_kernel;
  gemm_kernel(output_mapper, lhs.data(), rhs.data(), m, k, n, /*alpha*/ 1.0);

  // Compute the matmul with Eigen::Matrix.
  typedef Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> Matrix;
  typedef Map<Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> > MatrixMap;

  MatrixMap lhs_mat(lhs.data(), m, k);
  MatrixMap rhs_mat(rhs.data(), k, n);
  Matrix matmul_result(m, n);
  matmul_result.setZero();

  matmul_result = lhs_mat * rhs_mat;

  static const float error_threshold = 1e-4f;

  // Verify that the results are (approximately) equal.
  for (Index i = 0; i < m * n; ++i) {
    Scalar gemm = mkldnn_result(i);
    Scalar matmul = matmul_result(i % m, i / m);
    if ((std::abs)(gemm) > error_threshold &&
        (std::abs)(matmul) > error_threshold) {
      if (!Eigen::internal::isApprox(gemm, matmul, error_threshold))
        std::cout << "gemm=" << gemm << " matmul=" << matmul << std::endl;
      VERIFY(Eigen::internal::isApprox(gemm, matmul, error_threshold));
    }
  }
}

EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {
  CALL_SUBTEST(test_mkldnn_gemm_pack<float>());
  CALL_SUBTEST(test_mkldnn_gemm_kernel<float>());
}
#else
EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {}
#endif  // EIGEN_USE_MKLDNN