// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <[email protected]>
// Copyright (C) 2016 Benoit Steiner <[email protected]>

//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>

namespace Eigen {

namespace TensorSycl {
namespace internal {

/// Cache all the device information needed
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_item_sizes>()),
        max_mem_alloc_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(queue.get_device()
                              .template get_info<
                                  cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device()
                          .get_platform()
                          .template get_info<cl::sycl::info::platform::name>()),
        device_name(queue.get_device()
                        .template get_info<cl::sycl::info::device::name>()),
        device_vendor(
            queue.get_device()
                .template get_info<cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};

}  // end namespace internal
}  // end namespace TensorSycl

typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
// All devices (even an AMD CPU with the Intel OpenCL runtime) that support
// OpenCL and can consume SPIR or SPIR-V can use the Eigen SYCL backend and,
// consequently, TensorFlow via the Eigen SYCL backend.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
    -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name =
        platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(),
                   platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      bool unsupported_condition =
          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
           vendor.find("apu") == std::string::npos) ||
          (platform_name.find("experimental") != std::string::npos) ||
          device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}
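
// A minimal, illustrative sketch (not part of the original header) of how the
// helpers above are typically combined to run a tensor expression on a SYCL
// device. The tensor size, variable names and the `* 2.0f` expression are
// assumptions made purely for illustration:
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   Eigen::QueueInterface queue_interface(devices[0]);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//
//   Eigen::Tensor<float, 1> in(100), out(100);
//   in.setRandom();
//   float *d_in  = static_cast<float *>(sycl_device.allocate(in.size() * sizeof(float)));
//   float *d_out = static_cast<float *>(sycl_device.allocate(out.size() * sizeof(float)));
//   sycl_device.memcpyHostToDevice(d_in, in.data(), in.size() * sizeof(float));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in(d_in, 100), gpu_out(d_out, 100);
//   gpu_out.device(sycl_device) = gpu_in * 2.0f;
//   sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));
//   sycl_device.deallocate(d_in);
//   sycl_device.deallocate(d_out);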

class QueueInterface {
 public:
  /// Creating device by using cl::sycl::selector or cl::sycl::device.
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
        m_thread_pool(num_threads),
        m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    m_prog.build_with_kernel_type<DeviceOrSelector>();
    auto f = [&](cl::sycl::handler &cgh) {
      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
                                        [=]() {});
    };
    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
  }

  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel,
      unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(dev_or_sel,
                       [this](cl::sycl::exception_list l) {
                         this->exception_caught_ = this->sycl_async_handler(l);
                       },
                       num_threads) {}

#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif

  /// Attach an existing buffer to the pointer map; Eigen will not reuse it.
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return static_cast<void *>(pMapper.add_pointer(buf));
  }

  /// Detach a previously attached buffer.
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree<false>(p, pMapper);
  }

  /// Allocate a device pointer. The returned pointer is actually an 8-byte
  /// host pointer used as a key to access the SYCL device buffer: a device
  /// buffer cannot be used directly as m_data in Eigen leaf-node expressions,
  /// so we create a key pointer to be used during Eigen expression
  /// construction. When the Eigen expression is converted into its SYCL
  /// counterpart, this pointer is used as a key into buffer_map, and exactly
  /// one buffer is dedicated to it. The device pointer is released by calling
  /// the deallocate function.
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) {
      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    }
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
  }
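
  // Illustrative note (assumption, following the comment above): the returned
  // "pointer" is only a key into pMapper, not a raw address that can be
  // dereferenced on the host. For example:
  //
  //   void *key = queue_interface.allocate(256);   // creates one SYCL buffer
  //   // *static_cast<char *>(key) = 0;            // invalid: not host memory
  //   queue_interface.deallocate(key);             // releases the buffer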

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) {
      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    }
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    if (scratch_buffers.empty()) {
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    } else {
      for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
        auto buff = pMapper.get_buffer(*it);
        if (buff.get_size() >= num_bytes) {
          auto ptr = *it;
          scratch_buffers.erase(it);
          return ptr;
        } else {
          ++it;
        }
      }
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    }
#else
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#endif
  }
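
  // Illustrative note (assumption): unless EIGEN_SYCL_NO_REUSE_BUFFERS is
  // defined, deallocate_temp() parks a buffer in scratch_buffers instead of
  // freeing it, and the loop above hands it back out on the next
  // allocate_temp() request that fits, e.g.
  //
  //   void *t0 = queue_interface.allocate_temp(1024);
  //   queue_interface.deallocate_temp(t0);            // kept as scratch
  //   void *t1 = queue_interface.allocate_temp(512);  // may reuse t0's buffer
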
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
  }
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return static_cast<data_t *>(data.get_virtual_pointer());
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.insert(p);
#else
    TensorSycl::internal::SYCLfree(p, pMapper);
#endif
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
    deallocate_temp(p.get_virtual_pointer());
  }

  /// This is used to deallocate the device pointer. p is used as a key inside
  /// the map to find the device buffer and delete it.
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree(p, pMapper);
  }

  EIGEN_STRONG_INLINE void deallocate_all() const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }

  /// The memcpyHostToDevice is used to copy the data from host to device.
  /// The destination pointer could be deleted before the copy happens, which
  /// is why a callback function is needed. By default, if none is provided,
  /// the function is blocking.
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
        write_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
      auto non_deleter = [](buffer_scalar_t const *) {};
      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(s_ptr, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }

  /// The memcpyDeviceToHost is used to copy the data from device to host.
  /// The source pointer could be deleted before the copy happens, which is
  /// why a callback function is needed. By default, if none is provided, the
  /// function is blocking.
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
        read_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
      auto non_deleter = [](buffer_scalar_t *) {};
      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(src_acc, s_ptr);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }

  /// The memcpy function.
  /// No callback is required here as both arguments are on the device
  /// and SYCL can handle the dependency.
  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      cgh.copy(src_acc, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }

  /// The memset function.
  /// No callback is required here as both arguments are on the device
  /// and SYCL can handle the dependency.
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
      // The cast to uint8_t is here to match the behaviour of the standard
      // memset. The cast to buffer_scalar_t is needed to match the type of the
      // accessor (in case buffer_scalar_t is not uint8_t).
      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }
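
  // Illustrative sketch (assumption) of the copy/fill entry points above,
  // operating on virtual pointers returned by allocate():
  //
  //   std::vector<float> host(64, 1.0f);
  //   void *dev = queue_interface.allocate(host.size() * sizeof(float));
  //   queue_interface.memcpyHostToDevice(dev, host.data(),
  //                                      host.size() * sizeof(float), {});
  //   queue_interface.memset(dev, 0, host.size() * sizeof(float));
  //   queue_interface.memcpyDeviceToHost(host.data(), dev,
  //                                      host.size() * sizeof(float), {});
  //   queue_interface.deallocate(dev);  // host now holds 64 zero-filled floats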

  /// Get a range accessor to the virtual pointer's device memory. This range
  /// accessor will allow access to the memory from the pointer to the end of
  /// the buffer.
  ///
  /// NOTE: Inside a kernel the range accessor will always be indexed from the
  /// start of the buffer, so the offset in the accessor is only used by
  /// methods like handler::copy and will not be available inside a kernel.
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;

    std::lock_guard<std::mutex> lock(pmapper_mutex_);

    auto original_buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    const ptrdiff_t typed_offset = offset / sizeof(T);
    eigen_assert(typed_offset >= 0);
    const auto typed_size = original_buffer.get_size() / sizeof(T);
    auto buffer = original_buffer.template reinterpret<
        typename Eigen::internal::remove_const<T>::type>(
        cl::sycl::range<1>(typed_size));
    const ptrdiff_t size = buffer.get_count() - typed_offset;
    eigen_assert(size >= 0);
    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
                               1, AcMd, global_access, is_place_holder>
        placeholder_accessor_t;
    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
                                           cl::sycl::id<1>(typed_offset)),
                    static_cast<size_t>(typed_offset),
                    reinterpret_cast<std::intptr_t>(start_ptr));
  }
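
  // Worked example (illustrative): for T = float and a virtual pointer that
  // sits 64 bytes past the start of its buffer, offset = 64 and
  // typed_offset = 64 / sizeof(float) = 16, so the returned RangeAccess
  // covers the buffer from element 16 to its end, while kernels still index
  // the placeholder accessor from element 0 (see the NOTE above).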

  /// Get a range accessor to the virtual pointer's device memory with a
  /// specified size.
  template <cl::sycl::access::mode AcMd, typename Index>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
                     const Index n_bytes) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    eigen_assert(n_bytes >= 0);
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    auto buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    eigen_assert(offset >= 0);
    eigen_assert(offset + n_bytes <= buffer.get_size());
    return buffer.template get_access<AcMd, global_access>(
        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
  }

  /// Creates a SYCL accessor for a buffer. This function first tries to find
  /// the buffer in the buffer_map. If found, it gets the accessor from it;
  /// if not, the function adds an entry by creating a SYCL buffer for that
  /// particular pointer.
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr)
        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
            cgh);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr);
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_offset(ptr);
  }

  template <typename OutScalar, typename sycl_kernel, typename Lhs,
            typename Rhs, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
                                                  const Rhs &rhs, OutPtr outptr,
                                                  Range thread_range,
                                                  Index scratchSize,
                                                  T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to the command group handler.
      lhs.bind(cgh);
      rhs.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
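
  // Illustrative note (assumption): sycl_kernel is expected to be a functor
  // type constructible from (scratch, lhs, rhs, outptr, var...) whose call
  // operator is what parallel_for invokes over thread_range, roughly:
  //
  //   struct MyBinaryKernel {
  //     MyBinaryKernel(LocalAccessor scratch, Lhs lhs, Rhs rhs, OutPtr out);
  //     void operator()(cl::sycl::nd_item<1> item) const;  // device code
  //   };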

  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename OutPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
                                                 OutPtr &outptr,
                                                 Range thread_range,
                                                 Index scratchSize,
                                                 T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to the command group handler.
      inptr.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }

  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
                                                   Range thread_range,
                                                   Index scratchSize,
                                                   T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to the command group handler.
      inptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }

  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }

  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
    set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
    synchronize();
#endif
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
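
  // Worked example (illustrative): with a power-of-two work-group size of 256
  // and n = 1000, this yields tileSize = 256, rng = 1000 and
  // GRange = 1000 + (256 - 1000 % 256) = 1024, i.e. the global range is
  // rounded up to the next multiple of the tile size.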

  /// This is used to prepare the number of threads and also the number of
  /// threads per block for sycl kernels
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  /// This is used to prepare the number of threads and also the number of
  /// threads per block for sycl kernels
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0)
        global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(
        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size /
                                        (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type ==
           cl::sycl::info::local_mem_type::local;
#endif
  }

  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return m_device_info.max_mem_alloc_size;
  }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return m_device_info.max_compute_units;
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return m_device_info.max_work_group_size;
  }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return m_device_info.max_work_item_sizes;
  }

  /// Not needed for SYCL; it should act the same as the CPU version.
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not have such a concept
    return 2;
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return m_device_info.local_mem_size;
  }

  // This function returns the nearest power-of-2 work-group size which is <=
  // the maximum device work-group size.
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }

  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return m_device_info.platform_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return m_device_info.device_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return m_device_info.device_vendor;
  }

  // This function returns the nearest power of 2:
  // if roundUp is true, the result is >= wGSize;
  // otherwise the result is <= wGSize.
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }
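
  // Worked example (illustrative): getPowerOfTwo(48, false) == 32 (round
  // down), getPowerOfTwo(48, true) == 64 (round up), and an exact power of
  // two such as 64 is returned unchanged in both modes.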

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }

  // This function checks if the runtime recorded an error for the
  // underlying stream device.
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }

  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    return latest_events_[std::this_thread::get_id()];
#else
    eigen_assert(false);
    return cl::sycl::event();
#endif
  }

  // destructor
  ~QueueInterface() {
    pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }

 protected:
  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    latest_events_[std::this_thread::get_id()] = e;
#else
    EIGEN_UNUSED_VARIABLE(e);
#endif
  }

  void synchronize_and_callback(cl::sycl::event e,
                                const std::function<void()> &callback) const {
    set_latest_event(e);
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }

  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }

  /// class members:
  bool exception_caught_ = false;

  mutable std::mutex pmapper_mutex_;

#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
  mutable std::mutex event_mutex_;
  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif

  /// std::map is the container used to make sure that we create only one buffer
  /// per pointer. The lifespan of the buffer now depends on the lifespan of
  /// SyclDevice. If a non-read-only pointer needs to be accessed on the host,
  /// we should manually deallocate it.
  mutable TensorSycl::internal::PointerMapper pMapper;
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
  mutable std::unordered_set<void *> scratch_buffers;
#endif
  /// sycl queue
  mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  mutable cl::sycl::program m_prog;
#endif

  /// The thread pool is used to wait on events and call callbacks
  /// asynchronously
  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};

struct SyclDeviceBase {
  /// The QueueInterface is not owned; it is the caller's responsibility to
  /// destroy it.
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream)
      : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    return m_queue_stream;
  }
};

// Here is a SYCL device struct which accepts the SYCL queue interface
// as an input.
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream)
      : SyclDeviceBase(queue_stream) {}

  // this is the accessor used to construct the evaluator
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
  }

  // get sycl accessor
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
  }

  /// Accessing the created sycl device buffer for the device pointer
  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    return queue_stream()->get_sycl_buffer(ptr);
  }

  /// This is used to prepare the number of threads and also the number of
  /// threads per block for sycl kernels
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  /// This is used to prepare the number of threads and also the number of
  /// threads per block for sycl kernels
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  /// This is used to prepare the number of threads and also the number of
  /// threads per block for sycl kernels
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  /// allocate device memory
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    return queue_stream()->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return queue_stream()->allocate_temp(num_bytes);
  }

  /// deallocate device memory
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    queue_stream()->deallocate(p);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  EIGEN_STRONG_INLINE void deallocate_all() const {
    queue_stream()->deallocate_all();
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return queue_stream()->get(data);
  }
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return queue_stream()->get(data);
  }

  /// attach existing buffer
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    return queue_stream()->attach_buffer(buf);
  }
  /// detach buffer
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    queue_stream()->detach_buffer(p);
  }
  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    return queue_stream()->get_offset(ptr);
  }

  // some runtime conditions that can be applied here
  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }

  /// memcpyHostToDevice
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      Index *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }
  /// memcpyDeviceToHost
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }
  /// the memcpy function
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }
  /// the memset function
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    queue_stream()->memset(data, c, n);
  }
  /// returning the sycl queue
  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    return queue_stream()->sycl_queue();
  }
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
    return queue_stream()->program();
  }
#endif

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the l2 cache for the time being, and
    // there is no l3 cache on sycl devices.
    return firstLevelCacheSize();
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return queue_stream()->maxSyclThreadsPerBlock();
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return queue_stream()->maxWorkItemSizes();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not have such a concept
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return queue_stream()->sharedMemPerBlock();
  }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }

  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }
  /// Not needed for SYCL; it should act the same as the CPU version.
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return queue_stream()->majorDeviceVersion();
  }

  EIGEN_STRONG_INLINE void synchronize() const {
    queue_stream()->synchronize();
  }
  EIGEN_STRONG_INLINE void async_synchronize(
      cl::sycl::event e = cl::sycl::event()) const {
    queue_stream()->async_synchronize(e);
  }
  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
    return queue_stream()->get_latest_event();
  }

  // This function checks if the runtime recorded an error for the
  // underlying stream device.
  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
    return queue_stream()->has_local_memory();
  }
  EIGEN_STRONG_INLINE long max_buffer_size() const {
    return queue_stream()->max_buffer_size();
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return queue_stream()->getPlatformName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return queue_stream()->getDeviceName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return queue_stream()->getDeviceVendor();
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
};
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H