From 1e6c6c1576a1bb90ac91df3cf733e46f49e10c8a Mon Sep 17 00:00:00 2001
From: Antonio Sanchez
Date: Tue, 11 May 2021 09:52:00 -0700
Subject: [PATCH] Replace memset with fill to work for non-trivial scalars.

For custom scalars, zero is not necessarily represented by a zeroed-out
memory block (e.g. GNU MPFR). We therefore cannot rely on `memset` if we
want to fill a matrix or tensor with zeroes. Instead, we should rely on
`fill`, which for trivial types does end up getting converted to a
`memset` under the hood (at least with gcc/clang).

Requires adding a `fill(begin, end, v)` method to `TensorDevice`.

Replaced all potentially bad instances of memset with fill.

Fixes #2245.
---
 Eigen/src/SparseCore/SparseMatrix.h           | 11 +--
 bench/tensors/tensor_benchmarks.h             |  6 +-
 bench/tensors/tensor_contract_sycl_bench.cc   |  6 +-
 test/OffByOneScalar.h                         | 28 +++++++
 .../CXX11/src/Tensor/TensorContraction.h      |  4 +-
 .../CXX11/src/Tensor/TensorContractionGpu.h   |  4 +-
 .../src/Tensor/TensorContractionThreadPool.h  |  4 +-
 .../CXX11/src/Tensor/TensorDeviceDefault.h    | 11 +++
 .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h  | 25 ++++++
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 76 +++++++++++++++----
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h |  5 ++
 .../src/Tensor/TensorGpuHipCudaDefines.h      |  2 +
 .../src/Tensor/TensorGpuHipCudaUndefines.h    |  1 +
 unsupported/Eigen/src/Skyline/SkylineMatrix.h | 16 ++--
 .../Eigen/src/Skyline/SkylineStorage.h        | 10 +--
 unsupported/test/cxx11_tensor_assign.cpp      | 18 ++---
 unsupported/test/cxx11_tensor_device.cu       | 44 +++++++++++
 unsupported/test/cxx11_tensor_device_sycl.cpp | 19 ++++-
 18 files changed, 229 insertions(+), 61 deletions(-)
 create mode 100644 test/OffByOneScalar.h

diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 616b4a0c2..1db906d5d 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -253,9 +253,10 @@ class SparseMatrix
     inline void setZero()
     {
       m_data.clear();
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
-      if(m_innerNonZeros)
-        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+      std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
+      if(m_innerNonZeros) {
+        std::fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+      }
     }
 
     /** Preallocates \a reserveSize non zeros.
@@ -641,7 +642,7 @@ class SparseMatrix
         std::free(m_innerNonZeros);
         m_innerNonZeros = 0;
       }
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
+      std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
     }
 
     /** \internal
@@ -1260,7 +1261,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca
     m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
     if(!m_innerNonZeros) internal::throw_std_bad_alloc();
 
-    memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+    std::fill(m_innerNonZeros, m_innerNonZeros + m_outerSize, StorageIndex(0));
 
     // pack all inner-vectors to the end of the pre-allocated space
     // and allocate the entire free-space to the first inner-vector
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index 0825e1563..0e8339e15 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -564,9 +564,9 @@ for (int iter = 0; iter < 10; ++iter) {
 
     // Initialize the content of the memory pools to prevent asan from
     // complaining.
-    device_.memset(a_, 12, m_ * k_ * sizeof(T));
-    device_.memset(b_, 23, k_ * n_ * sizeof(T));
-    device_.memset(c_, 31, m_ * n_ * sizeof(T));
+    device_.fill(a_, a_ + m_ * k_, T(12));
+    device_.fill(b_, b_ + k_ * n_, T(23));
+    device_.fill(c_, c_ + m_ * n_, T(31));
   }
 
diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc
index 8f2defe42..c2d098ecc 100644
--- a/bench/tensors/tensor_contract_sycl_bench.cc
+++ b/bench/tensors/tensor_contract_sycl_bench.cc
@@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T
 
   // Initialize the content of the memory pools to prevent asan from
   // complaining.
-  device_.memset(a_, 12, m_ * k_ * sizeof(T));
-  device_.memset(b_, 23, k_ * n_ * sizeof(T));
-  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+  device_.fill(a_, a_ + m_ * k_, T(12));
+  device_.fill(b_, b_ + k_ * n_, T(23));
+  device_.fill(c_, c_ + m_ * n_, T(31));
 
   Eigen::array<TensorIndex, 2> sizeA;
   sizeA[0] = m_;
diff --git a/test/OffByOneScalar.h b/test/OffByOneScalar.h
new file mode 100644
index 000000000..c0371a6c7
--- /dev/null
+++ b/test/OffByOneScalar.h
@@ -0,0 +1,28 @@
+// A Scalar with internal representation T+1 so that zero is internally
+// represented by T(1). This is used to test memory fill.
+//
+template<typename T>
+class OffByOneScalar {
+ public:
+  OffByOneScalar() : val_(1) {}
+  OffByOneScalar(const OffByOneScalar& other) {
+    *this = other;
+  }
+  OffByOneScalar& operator=(const OffByOneScalar& other) {
+    val_ = other.val_;
+    return *this;
+  }
+
+  OffByOneScalar(T val) : val_(val + 1) {}
+  OffByOneScalar& operator=(T val) {
+    val_ = val + 1;
+    return *this;
+  }
+
+  operator T() const {
+    return val_ - 1;
+  }
+
+ private:
+  T val_;
+};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 8b35f7985..cdd8840fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -762,7 +762,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator
     const Index resIncr(1);
 
     // zero out the result buffer (which must be of size at least rows * sizeof(Scalar)
-    m_device.memset(buffer, 0, rows * sizeof(Scalar));
+    m_device.fill(buffer, buffer + rows, Scalar(0));
 
     internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs>::run(
         rows, cols, lhs, rhs,
@@ -869,7 +869,7 @@
     // If a contraction kernel does not support beta, explicitly initialize
     // output buffer with zeroes.
     if (!TensorContractionKernel::HasBeta) {
-      this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+      this->m_device.fill(buffer, buffer + m * n, Scalar(0));
     }
 
     for(Index i2=0; i2<m; i2+=mc)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ ... @@
       const Index n = this->m_j_size;
 
-      // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-      this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+      // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+      this->m_device.fill(buffer, buffer + m * n, Scalar(0));
 
       typedef internal::TensorContractionInputMapper<
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ ... @@
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+    this->m_device.fill(buffer, buffer + m * n, Scalar(0));
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ ... @@ struct DefaultDevice {
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+    // std::fill is not a device function, so resort to simple loop.
+    for (T* it = begin; it != end; ++it) {
+      *it = value;
+    }
+#else
+    std::fill(begin, end, value);
+#endif
+  }
 
   template<typename Type>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
     return data;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
index ec2e3cb14..a9f951836 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -281,10 +281,35 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     gpu_assert(err == gpuSuccess);
 #else
+    EIGEN_UNUSED_VARIABLE(buffer)
+    EIGEN_UNUSED_VARIABLE(c)
+    EIGEN_UNUSED_VARIABLE(n)
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    const size_t count = end - begin;
+    // Split value into bytes and run memset with stride.
+    const int value_size = sizeof(value);
+    char* buffer = (char*)begin;
+    char* value_bytes = (char*)(&value);
+    gpuError_t err;
+    EIGEN_UNUSED_VARIABLE(err)
+    for (int b=0; b<value_size; ++b) {
+      err = gpuMemset2DAsync(buffer+b, value_size, value_bytes[b], 1, count, stream_->stream());
+      gpu_assert(err == gpuSuccess);
+    }
+#else
+    EIGEN_UNUSED_VARIABLE(begin)
+    EIGEN_UNUSED_VARIABLE(end)
+    EIGEN_UNUSED_VARIABLE(value)
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
   EIGEN_STRONG_INLINE size_t numThreads() const {
     // FIXME
     return 32;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index df591c21d..f63d79945 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -327,13 +327,27 @@ class QueueInterface {
     if (n == 0) {
       return;
     }
-    n /= sizeof(buffer_scalar_t);
     auto f = [&](cl::sycl::handler &cgh) {
-      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
-      // The cast to uint8_t is here to match the behaviour of the standard
-      // memset. The cast to buffer_scalar_t is needed to match the type of the
-      // accessor (in case buffer_scalar_t is not uint8_t)
-      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
+      // Get a typed range accessor to ensure we fill each byte, in case
+      // `buffer_scalar_t` is not (u)int8_t.
+      auto dst_acc = get_typed_range_accessor<write_mode, uint8_t>(cgh, data, n);
+      cgh.fill(dst_acc, static_cast<uint8_t>(c));
     };
     cl::sycl::event e;
     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
     async_synchronize(e);
   }
 
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    if (begin == end) {
+      return;
+    }
+    const ptrdiff_t count = end - begin;
+    auto f = [&](cl::sycl::handler &cgh) {
+      auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
+      cgh.fill(dst_acc, value);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    async_synchronize(e);
+  }
@@ -359,6 +373,8 @@ class QueueInterface {
     auto original_buffer = pMapper.get_buffer(ptr);
     const ptrdiff_t offset = pMapper.get_offset(ptr);
+    eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
+    eigen_assert(original_buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
     const ptrdiff_t typed_offset = offset / sizeof(T);
     eigen_assert(typed_offset >= 0);
     const auto typed_size = original_buffer.get_size() / sizeof(T);
@@ -395,6 +411,40 @@ class QueueInterface {
         cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
   }
 
+  /// Get a range accessor to the virtual pointer's device memory with a
+  /// specified type and count.
+  template <cl::sycl::access::mode AcMd, typename T, typename Index>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      T, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_typed_range_accessor(cl::sycl::handler &cgh, const void *ptr,
+                           const Index count) const {
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    eigen_assert(count >= 0);
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    auto buffer = pMapper.get_buffer(ptr);
+    const ptrdiff_t offset = pMapper.get_offset(ptr);
+    eigen_assert(offset >= 0);
+
+    // Technically we should create a subbuffer for the desired range,
+    // then reinterpret that. However, I was not able to get changes to reflect
+    // in the original buffer (only the subbuffer and reinterpreted buffer).
+    // This current implementation now has the restriction that the buffer
+    // offset and original buffer size must be a multiple of sizeof(T).
+    // Note that get_range_accessor(void*) currently has the same restriction.
+    //
+    // auto subbuffer = cl::sycl::buffer<buffer_scalar_t, 1>(buffer,
+    //     cl::sycl::id<1>(offset), cl::sycl::range<1>(n_bytes));
+    eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
+    eigen_assert(buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
+    const ptrdiff_t typed_offset = offset / sizeof(T);
+    const size_t typed_size = buffer.get_size() / sizeof(T);
+    auto reint = buffer.template reinterpret<
+        typename Eigen::internal::remove_const<T>::type>(
+        cl::sycl::range<1>(typed_size));
+    return reint.template get_access<AcMd, global_access>(
+        cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(typed_offset));
+  }
+
   /// Creation of sycl accessor for a buffer. This function first tries to find
   /// the buffer in the buffer_map.
   /// If found it gets the accessor from it, if
   /// not, the function then adds an entry by creating a sycl buffer for that
   /// particular pointer.
@@ -912,15 +962,6 @@ struct SyclDevice : public SyclDeviceBase {
     return queue_stream()->get(data);
   }
 
-  /// attach existing buffer
-  EIGEN_STRONG_INLINE void *attach_buffer(
-      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
-    return queue_stream()->attach_buffer(buf);
-  }
-  /// detach buffer
-  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
-    queue_stream()->detach_buffer(p);
-  }
   EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
     return queue_stream()->get_offset(ptr);
   }
@@ -951,6 +992,11 @@ struct SyclDevice : public SyclDeviceBase {
   EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
     queue_stream()->memset(data, c, n);
   }
+  /// the fill function
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    queue_stream()->fill(begin, end, value);
+  }
   /// returning the sycl queue
   EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
     return queue_stream()->sycl_queue();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index e524b535a..18cc79a0b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -122,6 +122,11 @@ struct ThreadPoolDevice {
     ::memset(buffer, c, n);
   }
 
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    std::fill(begin, end, value);
+  }
+
   EIGEN_STRONG_INLINE int numThreads() const {
     return num_threads_;
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
index cb53ce298..82ca999b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -41,6 +41,7 @@
 #define gpuMalloc hipMalloc
 #define gpuFree hipFree
 #define gpuMemsetAsync hipMemsetAsync
+#define gpuMemset2DAsync hipMemset2DAsync
 #define gpuMemcpyAsync hipMemcpyAsync
 #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
@@ -71,6 +72,7 @@
 #define gpuMalloc cudaMalloc
 #define gpuFree cudaFree
 #define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemset2DAsync cudaMemset2DAsync
 #define gpuMemcpyAsync cudaMemcpyAsync
 #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
index 1d142f2ee..e4d4bd5e7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
@@ -26,6 +26,7 @@
 #undef gpuMalloc
 #undef gpuFree
 #undef gpuMemsetAsync
+#undef gpuMemset2DAsync
 #undef gpuMemcpyAsync
 #undef gpuMemcpyDeviceToDevice
 #undef gpuMemcpyDeviceToHost
diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/unsupported/Eigen/src/Skyline/SkylineMatrix.h
index 7c7eace7f..664a97f60 100644
--- a/unsupported/Eigen/src/Skyline/SkylineMatrix.h
+++ b/unsupported/Eigen/src/Skyline/SkylineMatrix.h
@@ -375,8 +375,8 @@ public:
     /** Removes all non zeros */
     inline void setZero() {
         m_data.clear();
-        memset(m_colStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
-        memset(m_rowStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+        std::fill_n(m_colStartIndex, m_outerSize + 1, Index(0));
+        std::fill_n(m_rowStartIndex, m_outerSize + 1, Index(0));
     }
 
     /** \returns the number of non zero coefficients */
@@ -435,7 +435,7 @@ public:
                 }
 
                 //zeros new data
-                memset(this->_upperPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+                std::fill_n(this->_upperPtr() + start, bandIncrement - 1, Scalar(0));
 
                 return m_data.upper(m_colStartIndex[inner]);
             } else {
@@ -466,7 +466,7 @@ public:
                 }
 
                 //zeros new data
-                memset(this->_lowerPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+                std::fill_n(this->_lowerPtr() + start, bandIncrement - 1, Scalar(0));
 
                 return m_data.lower(m_rowStartIndex[outer]);
             } else {
                 return m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
@@ -493,7 +493,7 @@ public:
                 for (Index innerIdx = inner + 1; innerIdx < outerSize() + 1; innerIdx++) {
                     m_rowStartIndex[innerIdx] += bandIncrement;
                 }
-                memset(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+                std::fill_n(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, bandIncrement - 1, Scalar(0));
                 return m_data.upper(m_rowStartIndex[inner] + m_data.upperProfile(inner));
             } else {
                 return m_data.upper(m_rowStartIndex[inner] + (outer - inner));
@@ -520,7 +520,7 @@ public:
                 for (Index innerIdx = outer + 1; innerIdx < outerSize() + 1; innerIdx++) {
                     m_colStartIndex[innerIdx] += bandIncrement;
                 }
-                memset(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+                std::fill_n(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, bandIncrement - 1, Scalar(0));
                 return m_data.lower(m_colStartIndex[outer] + m_data.lowerProfile(outer));
             } else {
                 return m_data.lower(m_colStartIndex[outer] + (inner - outer));
@@ -619,8 +619,8 @@ public:
 
         m_data.clear();
        m_outerSize = diagSize;
-        memset(m_colStartIndex, 0, (cols + 1) * sizeof (Index));
-        memset(m_rowStartIndex, 0, (rows + 1) * sizeof (Index));
+        std::fill_n(m_colStartIndex, cols + 1, Index(0));
+        std::fill_n(m_rowStartIndex, rows + 1, Index(0));
     }
 
     void resizeNonZeros(Index size) {
diff --git a/unsupported/Eigen/src/Skyline/SkylineStorage.h b/unsupported/Eigen/src/Skyline/SkylineStorage.h
index cc7514f12..9c55f2997 100644
--- a/unsupported/Eigen/src/Skyline/SkylineStorage.h
+++ b/unsupported/Eigen/src/Skyline/SkylineStorage.h
@@ -187,11 +187,11 @@ public:
     }
 
     inline void reset() {
-        memset(m_diag, 0, m_diagSize * sizeof (Scalar));
-        memset(m_upper, 0, m_upperSize * sizeof (Scalar));
-        memset(m_lower, 0, m_lowerSize * sizeof (Scalar));
-        memset(m_upperProfile, 0, m_diagSize * sizeof (Index));
-        memset(m_lowerProfile, 0, m_diagSize * sizeof (Index));
+        std::fill_n(m_diag, m_diagSize, Scalar(0));
+        std::fill_n(m_upper, m_upperSize, Scalar(0));
+        std::fill_n(m_lower, m_lowerSize, Scalar(0));
+        std::fill_n(m_upperProfile, m_diagSize, Index(0));
+        std::fill_n(m_lowerProfile, m_diagSize, Index(0));
     }
 
     void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar>()) {
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
index ce9d24369..8e3ca0fc0 100644
--- a/unsupported/test/cxx11_tensor_assign.cpp
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -25,10 +25,8 @@ static void test_1d()
   vec1(4) = 23; vec2(4) = 4;
   vec1(5) = 42; vec2(5) = 5;
 
-  int col_major[6];
-  int row_major[6];
-  memset(col_major, 0, 6*sizeof(int));
-  memset(row_major, 0, 6*sizeof(int));
+  int col_major[6] = {0};
+  int row_major[6] = {0};
 
   TensorMap<Tensor<int, 1> > vec3(col_major, 6);
   TensorMap<Tensor<int, 1, RowMajor> > vec4(row_major, 6);
@@ -88,10 +86,8 @@ static void test_2d()
   mat2(1,1) = 4;
   mat2(1,2) = 5;
 
-  int col_major[6];
-  int row_major[6];
-  memset(col_major, 0, 6*sizeof(int));
-  memset(row_major, 0, 6*sizeof(int));
+  int col_major[6] = {0};
+  int row_major[6] = {0};
 
   TensorMap<Tensor<int, 2> > mat3(row_major, 2, 3);
   TensorMap<Tensor<int, 2, RowMajor> > mat4(col_major, 2, 3);
@@ -148,10 +144,8 @@ static void test_3d()
     }
   }
 
-  int col_major[2*3*7];
-  int row_major[2*3*7];
-  memset(col_major, 0, 2*3*7*sizeof(int));
-  memset(row_major, 0, 2*3*7*sizeof(int));
+  int col_major[2*3*7] = {0};
+  int row_major[2*3*7] = {0};
 
   TensorMap<Tensor<int, 3> > mat3(col_major, 2, 3, 7);
   TensorMap<Tensor<int, 3, RowMajor> > mat4(row_major, 2, 3, 7);
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index c9f78d2d3..58cfc01bf 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -14,6 +14,7 @@
 #define EIGEN_USE_GPU
 
 #include "main.h"
+#include "OffByOneScalar.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
 
@@ -175,6 +176,44 @@ void test_3d_convolution(Context* context)
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
 }
 
+// Helper method to synchronize device.
+template<typename Device>
+void synchronize(Device& device) { /*nothing*/ }
+template<>
+void synchronize(Eigen::GpuDevice& device) {
+  device.synchronize();
+}
+
+template<typename DataType, typename TensorDevice>
+void test_device_memory(const TensorDevice& device) {
+  int count = 100;
+  Eigen::array<int, 1> tensorRange = {{count}};
+  Eigen::Tensor<DataType, 1> host(tensorRange);
+  Eigen::Tensor<DataType, 1> expected(tensorRange);
+  DataType* device_data = static_cast<DataType*>(device.allocate(count * sizeof(DataType)));
+
+  // memset
+  const char byte_value = static_cast<char>(0xAB);
+  device.memset(device_data, byte_value, count * sizeof(DataType));
+  device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType));
+  synchronize(device);
+  memset(expected.data(), byte_value, count * sizeof(DataType));
+  for (size_t i=0; i<count; ++i) {
+    VERIFY_IS_EQUAL(host(i), expected(i));
+  }
+
+  // fill
+  DataType fill_value = DataType(7);
+  std::fill_n(expected.data(), count, fill_value);
+  device.fill(device_data, device_data + count, fill_value);
+  device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType));
+  synchronize(device);
+  for (size_t i=0; i<count; ++i) {
+    VERIFY_IS_EQUAL(host(i), expected(i));
+  }
+
+  device.deallocate(device_data);
+}
+
 void test_cpu() {
   Eigen::Tensor<float, 3> in1(40,50,70);
@@ -266,6 +305,9 @@ void test_cpu() {
       }
     }
   }
+
+  test_device_memory<float>(context.device());
+  test_device_memory<OffByOneScalar<float>>(context.device());
 }
 
 void test_gpu() {
@@ -386,6 +428,8 @@ void test_gpu() {
 #endif
 
+  test_device_memory<float>(context.device());
+  test_device_memory<OffByOneScalar<float>>(context.device());
 }
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 5095cb078..a9b542c03 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -18,26 +18,36 @@
 #define EIGEN_USE_SYCL
 
 #include "main.h"
+#include "OffByOneScalar.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 #include <stdint.h>
 #include <iostream>
 
 template <typename DataType, int DataLayout, typename IndexType>
 void test_device_memory(const Eigen::SyclDevice &sycl_device) {
-  std::cout << "Running on : "
-            << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
-            << std::endl;
   IndexType sizeDim1 = 100;
   array<IndexType, 1> tensorRange = {{sizeDim1}};
   Tensor<DataType, 1, DataLayout, IndexType> in(tensorRange);
   Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
-  memset(in1.data(), 1, in1.size() * sizeof(DataType));
   DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+
+  // memset
+  memset(in1.data(), 1, in1.size() * sizeof(DataType));
   sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
   sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
   for (IndexType i=0; i<in.size(); i++) {
     VERIFY_IS_EQUAL(in(i), in1(i));
   }
+
+  // fill
+  DataType value = DataType(7);
+  std::fill_n(in1.data(), in1.size(), value);
+  sycl_device.fill(gpu_in_data, gpu_in_data + in.size(), value);
+  sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+  for (IndexType i=0; i<in.size(); i++) {
+    VERIFY_IS_EQUAL(in(i), in1(i));
+  }
+
   sycl_device.deallocate(gpu_in_data);
 }
@@ ... @@ template <typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d) {
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+    CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
   }
 }
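
A standalone sketch of the failure mode this patch addresses, using a simplified version of the `OffByOneScalar` introduced in test/OffByOneScalar.h. This is illustrative only and not part of the patch; the file name and `main` driver are mine:

// offbyone_fill_demo.cc -- build with: g++ -std=c++11 offbyone_fill_demo.cc
#include <algorithm>
#include <cassert>
#include <cstring>

// Simplified from test/OffByOneScalar.h: zero is stored internally as 1,
// so an all-zero byte pattern does NOT represent zero.
template<typename T>
class OffByOneScalar {
 public:
  OffByOneScalar() : val_(1) {}
  OffByOneScalar(T val) : val_(val + 1) {}
  operator T() const { return val_ - 1; }
 private:
  T val_;
};

int main() {
  OffByOneScalar<int> a[4];

  // memset writes raw zero bytes: each element's internal value becomes 0,
  // which this scalar decodes as -1 rather than 0.
  std::memset(a, 0, sizeof(a));
  assert(static_cast<int>(a[0]) == -1);

  // std::fill copies a properly constructed zero, so each element's
  // internal value is 1 and it reads back as 0.
  std::fill(a, a + 4, OffByOneScalar<int>(0));
  assert(static_cast<int>(a[0]) == 0);
  return 0;
}

In other words, `memset` reproduces a byte pattern while `std::fill` goes through the scalar's assignment and conversion path, which is exactly why the patch routes zero-initialization through `fill`.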
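
The new `GpuDevice::fill` relies on byte striping: `gpuMemset2DAsync(ptr, pitch, value, width, height, stream)` sets `width` bytes every `pitch` bytes, `height` times, so one call per byte position of `value` (pitch = sizeof(T), width = 1, height = count) reproduces the full bit pattern of `value` across the range. Below is a host-side model of that strategy (my sketch, not from the patch), with the two plain loops standing in for the per-byte async memsets; it assumes a trivially copyable `T`:

// strided_fill_demo.cc -- CPU model of the GpuDevice::fill byte striping.
#include <cassert>
#include <cstddef>

template<typename T>
void strided_byte_fill(T* begin, T* end, const T& value) {
  const size_t count = end - begin;
  const size_t value_size = sizeof(T);
  char* buffer = reinterpret_cast<char*>(begin);
  const char* value_bytes = reinterpret_cast<const char*>(&value);
  for (size_t b = 0; b < value_size; ++b) {   // one "memset2D" per byte of T
    for (size_t i = 0; i < count; ++i) {      // height = count, width = 1
      buffer[i * value_size + b] = value_bytes[b];
    }
  }
}

int main() {
  double data[8];
  strided_byte_fill(data, data + 8, 3.14);
  assert(data[0] == 3.14 && data[7] == 3.14);
  return 0;
}

This works only because it copies the exact object representation of `value`; it is the device-side analogue of `std::fill` for trivially copyable types (including `OffByOneScalar`), which is why the GPU path can use strided memsets where a single plain `memset` cannot produce a non-trivial zero.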