From 1e6c6c1576a1bb90ac91df3cf733e46f49e10c8a Mon Sep 17 00:00:00 2001
From: Antonio Sanchez
Date: Tue, 11 May 2021 09:52:00 -0700
Subject: [PATCH] Replace memset with fill to work for non-trivial scalars.

For custom scalars, zero is not necessarily represented by a zeroed-out
memory block (e.g. GNU MPFR). We therefore cannot rely on `memset` if we
want to fill a matrix or tensor with zeroes. Instead, we should rely on
`fill`, which for trivial types does end up getting converted to a
`memset` under the hood (at least with gcc/clang).

Requires adding a `fill(begin, end, v)` method to `TensorDevice`.

Replaced all potentially bad instances of memset with fill.

Fixes #2245.
---
 Eigen/src/SparseCore/SparseMatrix.h           | 11 +--
 bench/tensors/tensor_benchmarks.h             |  6 +-
 bench/tensors/tensor_contract_sycl_bench.cc   |  6 +-
 test/OffByOneScalar.h                         | 28 +++++++
 .../CXX11/src/Tensor/TensorContraction.h      |  4 +-
 .../CXX11/src/Tensor/TensorContractionGpu.h   |  4 +-
 .../src/Tensor/TensorContractionThreadPool.h  |  4 +-
 .../CXX11/src/Tensor/TensorDeviceDefault.h    | 11 +++
 .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h  | 25 ++++++
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 76 +++++++++++++++----
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h |  5 ++
 .../src/Tensor/TensorGpuHipCudaDefines.h      |  2 +
 .../src/Tensor/TensorGpuHipCudaUndefines.h    |  1 +
 unsupported/Eigen/src/Skyline/SkylineMatrix.h | 16 ++--
 .../Eigen/src/Skyline/SkylineStorage.h        | 10 +--
 unsupported/test/cxx11_tensor_assign.cpp      | 18 ++---
 unsupported/test/cxx11_tensor_device.cu       | 44 +++++++++++
 unsupported/test/cxx11_tensor_device_sycl.cpp | 19 ++++-
 18 files changed, 229 insertions(+), 61 deletions(-)
 create mode 100644 test/OffByOneScalar.h

diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 616b4a0c2..1db906d5d 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -253,9 +253,10 @@ class SparseMatrix
     inline void setZero()
     {
       m_data.clear();
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
-      if(m_innerNonZeros)
-        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+      std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
+      if(m_innerNonZeros) {
+        std::fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+      }
     }
 
     /** Preallocates \a reserveSize non zeros.
@@ -641,7 +642,7 @@ class SparseMatrix
         std::free(m_innerNonZeros);
         m_innerNonZeros = 0;
       }
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
+      std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
     }
 
     /** \internal
@@ -1260,7 +1261,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca
     m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
     if(!m_innerNonZeros) internal::throw_std_bad_alloc();
 
-    memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+    std::fill(m_innerNonZeros, m_innerNonZeros + m_outerSize, StorageIndex(0));
 
     // pack all inner-vectors to the end of the pre-allocated space
     // and allocate the entire free-space to the first inner-vector
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index 0825e1563..0e8339e15 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -564,9 +564,9 @@ for (int iter = 0; iter < 10; ++iter) {
 
     // Initialize the content of the memory pools to prevent asan from
     // complaining.
-    device_.memset(a_, 12, m_ * k_ * sizeof(T));
-    device_.memset(b_, 23, k_ * n_ * sizeof(T));
-    device_.memset(c_, 31, m_ * n_ * sizeof(T));
+    device_.fill(a_, a_ + m_ * k_, T(12));
+    device_.fill(b_, b_ + k_ * n_, T(23));
+    device_.fill(c_, c_ + m_ * n_, T(31));
   }
 
diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc
index 8f2defe42..c2d098ecc 100644
--- a/bench/tensors/tensor_contract_sycl_bench.cc
+++ b/bench/tensors/tensor_contract_sycl_bench.cc
@@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T
 
   // Initialize the content of the memory pools to prevent asan from
   // complaining.
-  device_.memset(a_, 12, m_ * k_ * sizeof(T));
-  device_.memset(b_, 23, k_ * n_ * sizeof(T));
-  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+  device_.fill(a_, a_ + m_ * k_, T(12));
+  device_.fill(b_, b_ + k_ * n_, T(23));
+  device_.fill(c_, c_ + m_ * n_, T(31));
 
   Eigen::array<TensorIndex, 2> sizeA;
   sizeA[0] = m_;
diff --git a/test/OffByOneScalar.h b/test/OffByOneScalar.h
new file mode 100644
index 000000000..c0371a6c7
--- /dev/null
+++ b/test/OffByOneScalar.h
@@ -0,0 +1,28 @@
+// A Scalar with internal representation T+1 so that zero is internally
+// represented by T(1). This is used to test memory fill.
+//
+template<typename T>
+class OffByOneScalar {
+ public:
+  OffByOneScalar() : val_(1) {}
+  OffByOneScalar(const OffByOneScalar& other) {
+    *this = other;
+  }
+  OffByOneScalar& operator=(const OffByOneScalar& other) {
+    val_ = other.val_;
+    return *this;
+  }
+
+  OffByOneScalar(T val) : val_(val + 1) {}
+  OffByOneScalar& operator=(T val) {
+    val_ = val + 1;
+    return *this;
+  }
+
+  operator T() const {
+    return val_ - 1;
+  }
+
+ private:
+  T val_;
+};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 8b35f7985..cdd8840fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -762,7 +762,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator
     const Index resIncr(1);
 
     // zero out the result buffer (which must be of size at least rows * sizeof(Scalar)
-    m_device.memset(buffer, 0, rows * sizeof(Scalar));
+    m_device.fill(buffer, buffer + rows, Scalar(0));
 
     internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs>::run(
         rows, cols, lhs, rhs,
@@ -869,7 +869,7 @@
     // If a contraction kernel does not support beta, explicitly initialize
     // output buffer with zeroes.
     if (!TensorContractionKernel::HasBeta) {
-      this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+      this->m_device.fill(buffer, buffer + m * n, Scalar(0));
     }
 
     for(Index i2=0; i2<m; i2+=mc)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ ... @@
       const Index n = this->m_j_size;
 
-      // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-      this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+      // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+      this->m_device.fill(buffer, buffer + m * n, Scalar(0));
 
       typedef internal::TensorContractionInputMapper<
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ ... @@
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+    this->m_device.fill(buffer, buffer + m * n, Scalar(0));
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ ... @@ struct DefaultDevice {
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+    // std::fill is not a device function, so resort to simple loop.
+    for (T* it = begin; it != end; ++it) {
+      *it = value;
+    }
+#else
+    std::fill(begin, end, value);
+#endif
+  }
 
   template<typename Type>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
     return data;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
index ec2e3cb14..a9f951836 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -281,10 +281,35 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     gpu_assert(err == gpuSuccess);
 #else
+    EIGEN_UNUSED_VARIABLE(buffer)
+    EIGEN_UNUSED_VARIABLE(c)
+    EIGEN_UNUSED_VARIABLE(n)
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    const size_t count = end - begin;
+    // Split value into bytes and run memset with stride.
+    const int value_size = sizeof(value);
+    char* buffer = (char*)begin;
+    char* value_bytes = (char*)(&value);
+    gpuError_t err;
+    EIGEN_UNUSED_VARIABLE(err)
+    for (int b=0; b<value_size; ++b) {
+      err = gpuMemset2DAsync(buffer+b, value_size, value_bytes[b], 1, count, stream_->stream());
+      gpu_assert(err == gpuSuccess);
+    }
+#else
+    EIGEN_UNUSED_VARIABLE(begin)
+    EIGEN_UNUSED_VARIABLE(end)
+    EIGEN_UNUSED_VARIABLE(value)
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
   EIGEN_STRONG_INLINE size_t numThreads() const {
     // FIXME
     return 32;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index df591c21d..f63d79945 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -327,13 +327,27 @@ class QueueInterface {
     if (n == 0) {
       return;
     }
-    n /= sizeof(buffer_scalar_t);
     auto f = [&](cl::sycl::handler &cgh) {
-      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
-      // The cast to uint8_t is here to match the behaviour of the standard
-      // memset. The cast to buffer_scalar_t is needed to match the type of the
-      // accessor (in case buffer_scalar_t is not uint8_t)
-      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
+      // Get a typed range accessor to ensure we fill each byte, in case
+      // `buffer_scalar_t` is not (u)int8_t.
+      auto dst_acc = get_typed_range_accessor<write_mode, uint8_t>(cgh, data, n);
+      cgh.fill(dst_acc, static_cast<uint8_t>(c));
     };
     cl::sycl::event e;
     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
     async_synchronize(e);
   }
 
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    if (begin == end) {
+      return;
+    }
+    const ptrdiff_t count = end - begin;
+    auto f = [&](cl::sycl::handler &cgh) {
+      auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
+      cgh.fill(dst_acc, value);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    async_synchronize(e);
+  }
@@ -359,6 +373,8 @@ class QueueInterface {
     auto original_buffer = pMapper.get_buffer(ptr);
     const ptrdiff_t offset = pMapper.get_offset(ptr);
+    eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
+    eigen_assert(original_buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
     const ptrdiff_t typed_offset = offset / sizeof(T);
     eigen_assert(typed_offset >= 0);
     const auto typed_size = original_buffer.get_size() / sizeof(T);
@@ -395,6 +411,40 @@ class QueueInterface {
         cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
   }
 
+  /// Get a range accessor to the virtual pointer's device memory with a
+  /// specified type and count.
+  template <cl::sycl::access::mode AcMd, typename T, typename Index>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      T, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_typed_range_accessor(cl::sycl::handler &cgh, const void *ptr,
+                           const Index count) const {
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    eigen_assert(count >= 0);
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    auto buffer = pMapper.get_buffer(ptr);
+    const ptrdiff_t offset = pMapper.get_offset(ptr);
+    eigen_assert(offset >= 0);
+
+    // Technically we should create a subbuffer for the desired range,
+    // then reinterpret that. However, I was not able to get changes to reflect
+    // in the original buffer (only the subbuffer and reinterpreted buffer).
+    // This current implementation now has the restriction that the buffer
+    // offset and original buffer size must be a multiple of sizeof(T).
+    // Note that get_range_accessor(void*) currently has the same restriction.
+    //
+    // auto subbuffer = cl::sycl::buffer<buffer_scalar_t, 1>(buffer,
+    //     cl::sycl::id<1>(offset), cl::sycl::range<1>(n_bytes));
+    eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
+    eigen_assert(buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
+    const ptrdiff_t typed_offset = offset / sizeof(T);
+    const size_t typed_size = buffer.get_size() / sizeof(T);
+    auto reint = buffer.template reinterpret<
+        typename Eigen::internal::remove_const<T>::type>(
+        cl::sycl::range<1>(typed_size));
+    return reint.template get_access<AcMd, global_access>(
+        cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(typed_offset));
+  }
+
   /// Creation of sycl accessor for a buffer. This function first tries to find
   /// the buffer in the buffer_map.
   /// If found it gets the accessor from it, if
   /// not, the function then adds an entry by creating a sycl buffer for that
   /// particular pointer.
@@ -912,15 +962,6 @@ struct SyclDevice : public SyclDeviceBase {
     return queue_stream()->get(data);
   }
 
-  /// attach existing buffer
-  EIGEN_STRONG_INLINE void *attach_buffer(
-      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
-    return queue_stream()->attach_buffer(buf);
-  }
-  /// detach buffer
-  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
-    queue_stream()->detach_buffer(p);
-  }
   EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
     return queue_stream()->get_offset(ptr);
   }
@@ -951,6 +992,11 @@ struct SyclDevice : public SyclDeviceBase {
   EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
     queue_stream()->memset(data, c, n);
   }
+  /// the fill function
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    queue_stream()->fill(begin, end, value);
+  }
   /// returning the sycl queue
   EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
     return queue_stream()->sycl_queue();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index e524b535a..18cc79a0b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -122,6 +122,11 @@ struct ThreadPoolDevice {
     ::memset(buffer, c, n);
   }
 
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    std::fill(begin, end, value);
+  }
+
   EIGEN_STRONG_INLINE int numThreads() const {
     return num_threads_;
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
index cb53ce298..82ca999b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -41,6 +41,7 @@
 #define gpuMalloc hipMalloc
 #define gpuFree hipFree
 #define gpuMemsetAsync hipMemsetAsync
+#define gpuMemset2DAsync hipMemset2DAsync
 #define gpuMemcpyAsync hipMemcpyAsync
 #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
@@ -71,6 +72,7 @@
 #define gpuMalloc cudaMalloc
 #define gpuFree cudaFree
 #define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemset2DAsync cudaMemset2DAsync
 #define gpuMemcpyAsync cudaMemcpyAsync
 #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
index 1d142f2ee..e4d4bd5e7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
@@ -26,6 +26,7 @@
 #undef gpuMalloc
 #undef gpuFree
 #undef gpuMemsetAsync
+#undef gpuMemset2DAsync
 #undef gpuMemcpyAsync
 #undef gpuMemcpyDeviceToDevice
 #undef gpuMemcpyDeviceToHost
diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/unsupported/Eigen/src/Skyline/SkylineMatrix.h
index 7c7eace7f..664a97f60 100644
--- a/unsupported/Eigen/src/Skyline/SkylineMatrix.h
+++ b/unsupported/Eigen/src/Skyline/SkylineMatrix.h
@@ -375,8 +375,8 @@ public:
     /** Removes all non zeros */
     inline void setZero() {
         m_data.clear();
-        memset(m_colStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
-        memset(m_rowStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+        std::fill_n(m_colStartIndex, m_outerSize + 1, Index(0));
+        std::fill_n(m_rowStartIndex, m_outerSize + 1, Index(0));
     }
 
     /** \returns the number of non zero coefficients */
@@ -435,7 +435,7 @@ public:
                 }
 
                 //zeros new data
-                memset(this->_upperPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+                std::fill_n(this->_upperPtr() + start, bandIncrement - 1, Scalar(0));
 
                 return m_data.upper(m_colStartIndex[inner]);
             } else {
@@ -466,7 +466,7 @@ public:
                 }
 
                 //zeros new data
-                memset(this->_lowerPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+                std::fill_n(this->_lowerPtr() + start, bandIncrement - 1, Scalar(0));
 
                 return m_data.lower(m_rowStartIndex[outer]);
             } else {
                 return m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
@@ -493,7 +493,7 @@ public:
                 for (Index innerIdx = inner + 1; innerIdx < outerSize() + 1; innerIdx++) {
                     m_rowStartIndex[innerIdx] += bandIncrement;
                 }
-                memset(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+                std::fill_n(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, bandIncrement - 1, Scalar(0));
                 return m_data.upper(m_rowStartIndex[inner] + m_data.upperProfile(inner));
             } else {
                 return m_data.upper(m_rowStartIndex[inner] + (outer - inner));
@@ -520,7 +520,7 @@ public:
                 for (Index innerIdx = outer + 1; innerIdx < outerSize() + 1; innerIdx++) {
                     m_colStartIndex[innerIdx] += bandIncrement;
                 }
-                memset(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+                std::fill_n(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, bandIncrement - 1, Scalar(0));
                 return m_data.lower(m_colStartIndex[outer] + m_data.lowerProfile(outer));
             } else {
                 return m_data.lower(m_colStartIndex[outer] + (inner - outer));
@@ -619,8 +619,8 @@ public:
 
         m_data.clear();
        m_outerSize = diagSize;
-        memset(m_colStartIndex, 0, (cols + 1) * sizeof (Index));
-        memset(m_rowStartIndex, 0, (rows + 1) * sizeof (Index));
+        std::fill_n(m_colStartIndex, cols + 1, Index(0));
+        std::fill_n(m_rowStartIndex, rows + 1, Index(0));
     }
 
     void resizeNonZeros(Index size) {
diff --git a/unsupported/Eigen/src/Skyline/SkylineStorage.h b/unsupported/Eigen/src/Skyline/SkylineStorage.h
index cc7514f12..9c55f2997 100644
--- a/unsupported/Eigen/src/Skyline/SkylineStorage.h
+++ b/unsupported/Eigen/src/Skyline/SkylineStorage.h
@@ -187,11 +187,11 @@ public:
     }
 
     inline void reset() {
-        memset(m_diag, 0, m_diagSize * sizeof (Scalar));
-        memset(m_upper, 0, m_upperSize * sizeof (Scalar));
-        memset(m_lower, 0, m_lowerSize * sizeof (Scalar));
-        memset(m_upperProfile, 0, m_diagSize * sizeof (Index));
-        memset(m_lowerProfile, 0, m_diagSize * sizeof (Index));
+        std::fill_n(m_diag, m_diagSize, Scalar(0));
+        std::fill_n(m_upper, m_upperSize, Scalar(0));
+        std::fill_n(m_lower, m_lowerSize, Scalar(0));
+        std::fill_n(m_upperProfile, m_diagSize, Index(0));
+        std::fill_n(m_lowerProfile, m_diagSize, Index(0));
     }
 
     void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar>()) {
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
index ce9d24369..8e3ca0fc0 100644
--- a/unsupported/test/cxx11_tensor_assign.cpp
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -25,10 +25,8 @@ static void test_1d()
   vec1(4) = 23; vec2(4) = 4;
   vec1(5) = 42; vec2(5) = 5;
 
-  int col_major[6];
-  int row_major[6];
-  memset(col_major, 0, 6*sizeof(int));
-  memset(row_major, 0, 6*sizeof(int));
+  int col_major[6] = {0};
+  int row_major[6] = {0};
 
   TensorMap<Tensor<int, 1> > vec3(col_major, 6);
   TensorMap<Tensor<int, 1, RowMajor> > vec4(row_major, 6);
@@ -88,10 +86,8 @@ static void test_2d()
   mat2(1,1) = 4;
   mat2(1,2) = 5;
 
-  int col_major[6];
-  int row_major[6];
-  memset(col_major, 0, 6*sizeof(int));
-  memset(row_major, 0, 6*sizeof(int));
+  int col_major[6] = {0};
+  int row_major[6] = {0};
 
   TensorMap<Tensor<int, 2> > mat3(row_major, 2, 3);
   TensorMap<Tensor<int, 2, RowMajor> > mat4(col_major, 2, 3);
@@ -148,10 +144,8 @@ static void test_3d()
     }
   }
 
-  int col_major[2*3*7];
-  int row_major[2*3*7];
-  memset(col_major, 0, 2*3*7*sizeof(int));
-  memset(row_major, 0, 2*3*7*sizeof(int));
+  int col_major[2*3*7] = {0};
+  int row_major[2*3*7] = {0};
 
   TensorMap<Tensor<int, 3> > mat3(col_major, 2, 3, 7);
   TensorMap<Tensor<int, 3, RowMajor> > mat4(row_major, 2, 3, 7);
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index c9f78d2d3..58cfc01bf 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -14,6 +14,7 @@
 #define EIGEN_USE_GPU
 
 #include "main.h"
+#include "OffByOneScalar.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
 
@@ -175,6 +176,44 @@ void test_3d_convolution(Context* context)
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
 }
 
+// Helper method to synchronize device.
+template<typename Device>
+void synchronize(Device& device) { /*nothing*/ }
+template<>
+void synchronize(Eigen::GpuDevice& device) {
+  device.synchronize();
+}
+
+template<typename DataType, typename TensorDevice>
+void test_device_memory(const TensorDevice& device) {
+  int count = 100;
+  Eigen::array<int, 1> tensorRange = {{count}};
+  Eigen::Tensor<DataType, 1> host(tensorRange);
+  Eigen::Tensor<DataType, 1> expected(tensorRange);
+  DataType* device_data = static_cast<DataType*>(device.allocate(count * sizeof(DataType)));
+
+  // memset
+  const char byte_value = static_cast<char>(0xAB);
+  device.memset(device_data, byte_value, count * sizeof(DataType));
+  device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType));
+  synchronize(device);
+  memset(expected.data(), byte_value, count * sizeof(DataType));
+  for (size_t i=0; i<count; ++i) {
+    VERIFY_IS_EQUAL(host(i), expected(i));
+  }
+
+  // fill
+  DataType fill_value = DataType(7);
+  std::fill_n(expected.data(), count, fill_value);
+  device.fill(device_data, device_data + count, fill_value);
+  device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType));
+  synchronize(device);
+  for (size_t i=0; i<count; ++i) {
+    VERIFY_IS_EQUAL(host(i), expected(i));
+  }
+
+  device.deallocate(device_data);
+}
+
 void test_cpu() {
   Eigen::Tensor<float, 3> in1(40,50,70);
@@ -266,6 +305,9 @@ void test_cpu() {
       }
     }
   }
+
+  test_device_memory<float>(context.device());
+  test_device_memory<OffByOneScalar<float>>(context.device());
 }
 
 void test_gpu() {
@@ -386,6 +428,8 @@ void test_gpu() {
 #endif
 
+  test_device_memory<float>(context.device());
+  test_device_memory<OffByOneScalar<float>>(context.device());
 }
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 5095cb078..a9b542c03 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -18,26 +18,36 @@
 #define EIGEN_USE_SYCL
 
 #include "main.h"
+#include "OffByOneScalar.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 #include <stdint.h>
 #include <iostream>
 
 template <typename DataType, int DataLayout, typename IndexType>
 void test_device_memory(const Eigen::SyclDevice &sycl_device) {
-  std::cout << "Running on : "
-            << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
-            << std::endl;
   IndexType sizeDim1 = 100;
   array<IndexType, 1> tensorRange = {{sizeDim1}};
   Tensor<DataType, 1, DataLayout, IndexType> in(tensorRange);
   Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
-  memset(in1.data(), 1, in1.size() * sizeof(DataType));
   DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+
+  // memset
+  memset(in1.data(), 1, in1.size() * sizeof(DataType));
   sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
   sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
   for (IndexType i=0; i<in.size(); i++) {
     VERIFY_IS_EQUAL(in(i), in1(i));
   }
+
+  // fill
+  DataType value = DataType(7);
+  std::fill_n(in1.data(), in1.size(), value);
+  sycl_device.fill(gpu_in_data, gpu_in_data + in.size(), value);
+  sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+  for (IndexType i=0; i<in.size(); i++) {
+    VERIFY_IS_EQUAL(in(i), in1(i));
+  }
+
   sycl_device.deallocate(gpu_in_data);
 }
@@ ... @@ template <typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d) {
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+    CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
   }
 }
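
A standalone sketch of the failure mode this patch addresses, using a simplified version of the `OffByOneScalar` introduced in test/OffByOneScalar.h. This is illustrative only and not part of the patch; the file name and `main` driver are mine:

// offbyone_fill_demo.cc -- build with: g++ -std=c++11 offbyone_fill_demo.cc
#include <algorithm>
#include <cassert>
#include <cstring>

// Simplified from test/OffByOneScalar.h: zero is stored internally as 1,
// so an all-zero byte pattern does NOT represent zero.
template<typename T>
class OffByOneScalar {
 public:
  OffByOneScalar() : val_(1) {}
  OffByOneScalar(T val) : val_(val + 1) {}
  operator T() const { return val_ - 1; }
 private:
  T val_;
};

int main() {
  OffByOneScalar<int> a[4];

  // memset writes raw zero bytes: each element's internal value becomes 0,
  // which this scalar decodes as -1 rather than 0.
  std::memset(a, 0, sizeof(a));
  assert(static_cast<int>(a[0]) == -1);

  // std::fill copies a properly constructed zero, so each element's
  // internal value is 1 and it reads back as 0.
  std::fill(a, a + 4, OffByOneScalar<int>(0));
  assert(static_cast<int>(a[0]) == 0);
  return 0;
}

In other words, `memset` reproduces a byte pattern while `std::fill` goes through the scalar's assignment and conversion path, which is exactly why the patch routes zero-initialization through `fill`.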
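
The new `GpuDevice::fill` relies on byte striping: `gpuMemset2DAsync(ptr, pitch, value, width, height, stream)` sets `width` bytes every `pitch` bytes, `height` times, so one call per byte position of `value` (pitch = sizeof(T), width = 1, height = count) reproduces the full bit pattern of `value` across the range. Below is a host-side model of that strategy (my sketch, not from the patch), with the two plain loops standing in for the per-byte async memsets; it assumes a trivially copyable `T`:

// strided_fill_demo.cc -- CPU model of the GpuDevice::fill byte striping.
#include <cassert>
#include <cstddef>

template<typename T>
void strided_byte_fill(T* begin, T* end, const T& value) {
  const size_t count = end - begin;
  const size_t value_size = sizeof(T);
  char* buffer = reinterpret_cast<char*>(begin);
  const char* value_bytes = reinterpret_cast<const char*>(&value);
  for (size_t b = 0; b < value_size; ++b) {   // one "memset2D" per byte of T
    for (size_t i = 0; i < count; ++i) {      // height = count, width = 1
      buffer[i * value_size + b] = value_bytes[b];
    }
  }
}

int main() {
  double data[8];
  strided_byte_fill(data, data + 8, 3.14);
  assert(data[0] == 3.14 && data[7] == 3.14);
  return 0;
}

This works only because it copies the exact object representation of `value`; it is the device-side analogue of `std::fill` for trivially copyable types (including `OffByOneScalar`), which is why the GPU path can use strided memsets where a single plain `memset` cannot produce a non-trivial zero.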