// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <[email protected]>
// Copyright (C) 2016 Benoit Steiner <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>

namespace Eigen {

namespace TensorSycl {
namespace internal {

/// Cache all the device information needed.
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_item_sizes>()),
        max_mem_alloc_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(queue.get_device()
                              .template get_info<
                                  cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device()
                          .get_platform()
                          .template get_info<cl::sycl::info::platform::name>()),
        device_name(queue.get_device()
                        .template get_info<cl::sycl::info::device::name>()),
        device_vendor(
            queue.get_device()
                .template get_info<cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};

}  // end namespace internal
}  // end namespace TensorSycl

typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;

// All devices (even AMD CPUs with the Intel OpenCL runtime) that support
// OpenCL and can consume SPIR or SPIR-V can use the Eigen SYCL backend and,
// consequently, TensorFlow via the Eigen SYCL backend.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
    -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name =
        platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(),
                   platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      bool unsupported_condition =
          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
           vendor.find("apu") == std::string::npos) ||
          (platform_name.find("experimental") != std::string::npos) ||
          device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}

class QueueInterface {
 public:
  /// Create the device by using a cl::sycl::selector or a cl::sycl::device.
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
        m_thread_pool(num_threads),
        m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    m_prog.build_with_kernel_type<DeviceOrSelector>();
    auto f = [&](cl::sycl::handler &cgh) {
      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
                                        [=]() {});
    };
    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
  }

  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel,
      unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(dev_or_sel,
                       [this](cl::sycl::exception_list l) {
                         this->exception_caught_ = this->sycl_async_handler(l);
                       },
                       num_threads) {}

#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif

  /// Attach an existing buffer to the pointer map; Eigen will not reuse it.
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return static_cast<void *>(pMapper.add_pointer(buf));
  }

  /// Detach a previously attached buffer.
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree<false>(p, pMapper);
  }
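  /// Illustrative sketch (not part of the interface): an externally created
  /// SYCL buffer could be exposed to Eigen through the pointer map and
  /// released again; `queue_interface` below is assumed to be a live
  /// QueueInterface instance.
  /// \code
  ///   cl::sycl::buffer<buffer_scalar_t, 1> buf{cl::sycl::range<1>(1024)};
  ///   void *key = queue_interface.attach_buffer(buf);
  ///   // ... use `key` wherever Eigen expects a device pointer ...
  ///   queue_interface.detach_buffer(key);
  /// \endcode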
  /// Allocating a device pointer. This pointer is actually an 8-byte host
  /// pointer used as a key to access the SYCL device buffer. The reason is
  /// that we cannot use a device buffer as the m_data pointer in Eigen
  /// leafNode expressions, so we create a key pointer to be used in Eigen
  /// expression construction. When we convert the Eigen construction into the
  /// SYCL construction we use this pointer as a key in our buffer_map and we
  /// make sure that we dedicate only one buffer to this pointer. The device
  /// pointer is deleted by calling the deallocate function.
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) {
      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    }
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) {
      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    }
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    if (scratch_buffers.empty()) {
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    } else {
      for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
        auto buff = pMapper.get_buffer(*it);
        if (buff.get_size() >= num_bytes) {
          auto ptr = *it;
          scratch_buffers.erase(it);
          return ptr;
        } else {
          ++it;
        }
      }
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    }
#else
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#endif
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return static_cast<data_t *>(data.get_virtual_pointer());
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.insert(p);
#else
    TensorSycl::internal::SYCLfree(p, pMapper);
#endif
  }

  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
    deallocate_temp(p.get_virtual_pointer());
  }
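  /// A minimal sketch of the intended allocation pattern (illustrative only;
  /// `queue_interface` is assumed to be a live QueueInterface instance). Note
  /// that the returned value is a key into the pointer map, not a host
  /// address that can be dereferenced directly.
  /// \code
  ///   void *key = queue_interface.allocate(256 * sizeof(float));
  ///   void *tmp = queue_interface.allocate_temp(256 * sizeof(float));
  ///   // ... build and run Eigen expressions using these keys ...
  ///   queue_interface.deallocate_temp(tmp);  // may be recycled for later temps
  ///   queue_interface.deallocate(key);
  /// \endcode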
  /// This is used to deallocate the device pointer. p is used as a key inside
  /// the map to find the device buffer and delete it.
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree(p, pMapper);
  }

  EIGEN_STRONG_INLINE void deallocate_all() const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }

  /// memcpyHostToDevice copies data from the host to the device.
  /// The destination pointer could be deleted before the copy has happened,
  /// which is why a callback function is needed. If no callback is provided,
  /// the function blocks until the copy completes.
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
        write_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
      auto non_deleter = [](buffer_scalar_t const *) {};
      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(s_ptr, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }

  /// memcpyDeviceToHost copies data from the device to the host.
  /// The source pointer could be deleted before the copy has happened,
  /// which is why a callback function is needed. If no callback is provided,
  /// the function blocks until the copy completes.
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
        read_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
      auto non_deleter = [](buffer_scalar_t *) {};
      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(src_acc, s_ptr);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }
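  /// Illustrative sketch of an asynchronous host-to-device copy followed by a
  /// blocking device-to-host copy (assumes `queue_interface` is a live
  /// QueueInterface and `key` was obtained from allocate()).
  /// \code
  ///   std::vector<float> host_in(256, 1.0f), host_out(256, 0.0f);
  ///   queue_interface.memcpyHostToDevice(
  ///       key, host_in.data(), 256 * sizeof(float),
  ///       [] { /* runs on the thread pool once the copy has finished */ });
  ///   // A default-constructed std::function makes the call blocking:
  ///   queue_interface.memcpyDeviceToHost(host_out.data(), key,
  ///                                      256 * sizeof(float), {});
  /// \endcode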
  /// The memcpy function.
  /// No callback is required here as both arguments are on the device
  /// and SYCL can handle the dependency.
  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      cgh.copy(src_acc, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }

  /// The memset function.
  /// No callback is required here as the argument is on the device
  /// and SYCL can handle the dependency.
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
      // The cast to uint8_t is here to match the behaviour of the standard
      // memset. The cast to buffer_scalar_t is needed to match the type of the
      // accessor (in case buffer_scalar_t is not uint8_t).
      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }
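  /// Illustrative sketch: zero-initialising a freshly allocated device region
  /// byte-wise, just like std::memset (`queue_interface` and `key` as above).
  /// \code
  ///   queue_interface.memset(key, 0, 256 * sizeof(float));
  ///   queue_interface.synchronize();  // wait for the fill to complete
  /// \endcode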
  /// Get a range accessor to the virtual pointer's device memory. This range
  /// accessor will allow access to the memory from the pointer to the end of
  /// the buffer.
  ///
  /// NOTE: Inside a kernel the range accessor will always be indexed from the
  /// start of the buffer, so the offset in the accessor is only used by
  /// methods like handler::copy and will not be available inside a kernel.
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;

    std::lock_guard<std::mutex> lock(pmapper_mutex_);

    auto original_buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    const ptrdiff_t typed_offset = offset / sizeof(T);
    eigen_assert(typed_offset >= 0);
    const auto typed_size = original_buffer.get_size() / sizeof(T);
    auto buffer = original_buffer.template reinterpret<
        typename Eigen::internal::remove_const<T>::type>(
        cl::sycl::range<1>(typed_size));
    const ptrdiff_t size = buffer.get_count() - typed_offset;
    eigen_assert(size >= 0);
    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
                               1, AcMd, global_access, is_place_holder>
        placeholder_accessor_t;
    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
                                           cl::sycl::id<1>(typed_offset)),
                    static_cast<size_t>(typed_offset),
                    reinterpret_cast<std::intptr_t>(start_ptr));
  }

  /// Get a range accessor to the virtual pointer's device memory with a
  /// specified size.
  template <cl::sycl::access::mode AcMd, typename Index>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
                     const Index n_bytes) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    eigen_assert(n_bytes >= 0);
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    auto buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    eigen_assert(offset >= 0);
    eigen_assert(offset + n_bytes <= buffer.get_size());
    return buffer.template get_access<AcMd, global_access>(
        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
  }
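  /// Illustrative sketch of obtaining a typed placeholder accessor for a
  /// virtual pointer (assumes `queue_interface` and `key` as above).
  /// \code
  ///   auto acc = queue_interface
  ///                  .get_range_accessor<cl::sycl::access::mode::read_write,
  ///                                      float>(key);
  ///   // `acc` must still be bound to a command-group handler, e.g.
  ///   // acc.bind(cgh), before it can be used inside a kernel.
  /// \endcode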
  /// Creation of a SYCL accessor for a buffer. This function first tries to
  /// find the buffer in the buffer_map. If found, it gets the accessor from
  /// it; if not, it adds an entry by creating a SYCL buffer for that
  /// particular pointer.
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr)
        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
            cgh);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr);
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_offset(ptr);
  }

  template <typename OutScalar, typename sycl_kernel, typename Lhs,
            typename Rhs, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
                                                  const Rhs &rhs, OutPtr outptr,
                                                  Range thread_range,
                                                  Index scratchSize,
                                                  T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to a command-group handler.
      lhs.bind(cgh);
      rhs.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }

  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename OutPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
                                                 OutPtr &outptr,
                                                 Range thread_range,
                                                 Index scratchSize,
                                                 T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to a command-group handler.
      inptr.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
                                                   Range thread_range,
                                                   Index scratchSize,
                                                   T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessor to a command-group handler.
      inptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }

  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }

  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
    set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
    synchronize();
#endif
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
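  /// Worked example (illustrative, assuming the nearest power-of-two
  /// work-group size is 256 and the EIGEN_SYCL_LOCAL_THREAD_DIM0/1 cap does
  /// not apply): for n = 1000 the setup above yields tileSize = 256,
  /// rng = 1000 and GRange = 1024, i.e. the global range is rounded up to the
  /// next multiple of the tile size so the last work-group is padded.
  /// \code
  ///   Eigen::Index tileSize, rng, GRange;
  ///   queue_interface.parallel_for_setup(Eigen::Index(1000), tileSize, rng,
  ///                                      GRange);
  ///   // tileSize == 256, rng == 1000, GRange == 1024 under the assumptions
  ///   // above.
  /// \endcode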
  /// This is used to prepare the number of threads and also the number of
  /// threads per block for SYCL kernels.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  /// This is used to prepare the number of threads and also the number of
  /// threads per block for SYCL kernels.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0)
        global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(
        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size /
                                        (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type ==
           cl::sycl::info::local_mem_type::local;
#endif
  }

  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return m_device_info.max_mem_alloc_size;
  }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return m_device_info.max_compute_units;
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return m_device_info.max_work_group_size;
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return m_device_info.max_work_item_sizes;
  }

  /// No need for SYCL; it should act the same as the CPU version.
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not have such a concept.
    return 2;
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return m_device_info.local_mem_size;
  }

  // This function returns the nearest power-of-2 work-group size which is <=
  // the maximum device work-group size.
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }

  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return m_device_info.platform_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return m_device_info.device_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return m_device_info.device_vendor;
  }

  // This function returns the nearest power of 2:
  // if roundUp is true it returns result >= wGSize,
  // else it returns result <= wGSize.
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
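  /// Worked example for the bit-smearing trick above: for wGSize = 100 the
  /// OR-cascade produces 127 (all bits below the MSB set). With
  /// roundUp == true the result is 127 + 1 = 128 (the smallest power of two
  /// >= 100); with roundUp == false it is 127 - 63 = 64 (the largest power of
  /// two <= 100).
  /// \code
  ///   size_t up = queue_interface.getPowerOfTwo(100, true);    // 128
  ///   size_t down = queue_interface.getPowerOfTwo(100, false); // 64
  /// \endcode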
  // This function checks if the runtime recorded an error for the
  // underlying stream device.
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }

  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    return latest_events_[std::this_thread::get_id()];
#else
    eigen_assert(false);
    return cl::sycl::event();
#endif
  }

  // destructor
  ~QueueInterface() {
    pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }

 protected:
  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    latest_events_[std::this_thread::get_id()] = e;
#else
    EIGEN_UNUSED_VARIABLE(e);
#endif
  }

  void synchronize_and_callback(cl::sycl::event e,
                                const std::function<void()> &callback) const {
    set_latest_event(e);
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }

  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }

  /// class members:
  bool exception_caught_ = false;

  mutable std::mutex pmapper_mutex_;

#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
  mutable std::mutex event_mutex_;
  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif

  /// The pointer mapper is the container used to make sure that we create only
  /// one buffer per pointer. The lifespan of the buffer now depends on the
  /// lifespan of SyclDevice. If a non-read-only pointer needs to be accessed
  /// on the host, it must be deallocated manually.
  mutable TensorSycl::internal::PointerMapper pMapper;
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
  mutable std::unordered_set<void *> scratch_buffers;
#endif
  /// SYCL queue
  mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  mutable cl::sycl::program m_prog;
#endif

  /// The thread pool is used to wait on events and call callbacks
  /// asynchronously.
  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};
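// A minimal end-to-end usage sketch (illustrative only; it assumes at least
// one supported SYCL device is available and omits error handling):
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   Eigen::QueueInterface queue_interface(devices[0]);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//
//   std::vector<float> host(512, 1.0f);
//   float *gpu = static_cast<float *>(
//       sycl_device.allocate(host.size() * sizeof(float)));
//   sycl_device.memcpyHostToDevice(gpu, host.data(),
//                                  host.size() * sizeof(float));
//   // ... evaluate tensor expressions on sycl_device ...
//   sycl_device.memcpyDeviceToHost(host.data(), gpu,
//                                  host.size() * sizeof(float));
//   sycl_device.deallocate(gpu);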
struct SyclDeviceBase {
  /// QueueInterface is not owned; it is the caller's responsibility to
  /// destroy it.
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream)
      : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    return m_queue_stream;
  }
};

// Here is a SYCL device struct which accepts the SYCL queue interface
// as an input.
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream)
      : SyclDeviceBase(queue_stream) {}

  // This is the accessor used to construct the evaluator.
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
  }

  // Get a SYCL accessor.
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
  }

  /// Accessing the created SYCL device buffer for the device pointer.
  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    return queue_stream()->get_sycl_buffer(ptr);
  }

  /// This is used to prepare the number of threads and also the number of
  /// threads per block for SYCL kernels.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  /// This is used to prepare the number of threads and also the number of
  /// threads per block for SYCL kernels.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  /// This is used to prepare the number of threads and also the number of
  /// threads per block for SYCL kernels.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  /// Allocate device memory.
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    return queue_stream()->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return queue_stream()->allocate_temp(num_bytes);
  }

  /// Deallocate device memory.
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    queue_stream()->deallocate(p);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }

  EIGEN_STRONG_INLINE void deallocate_all() const {
    queue_stream()->deallocate_all();
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return queue_stream()->get(data);
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return queue_stream()->get(data);
  }

  /// Attach an existing buffer.
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    return queue_stream()->attach_buffer(buf);
  }

  /// Detach a buffer.
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    queue_stream()->detach_buffer(p);
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    return queue_stream()->get_offset(ptr);
  }

  // Some runtime conditions that can be applied here.
  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }

  /// memcpyHostToDevice
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      Index *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }

  /// memcpyDeviceToHost
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }

  /// The memcpy function.
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }

  /// The memset function.
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    queue_stream()->memset(data, c, n);
  }

  /// Returning the SYCL queue.
  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    return queue_stream()->sycl_queue();
  }

#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
    return queue_stream()->program();
  }
#endif

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on SYCL devices.
    return firstLevelCacheSize();
  }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return queue_stream()->maxSyclThreadsPerBlock();
  }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return queue_stream()->maxWorkItemSizes();
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not have such a concept.
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return queue_stream()->sharedMemPerBlock();
  }

  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }

  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }

  /// No need for SYCL; it should act the same as the CPU version.
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return queue_stream()->majorDeviceVersion();
  }

  EIGEN_STRONG_INLINE void synchronize() const {
    queue_stream()->synchronize();
  }

  EIGEN_STRONG_INLINE void async_synchronize(
      cl::sycl::event e = cl::sycl::event()) const {
    queue_stream()->async_synchronize(e);
  }

  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
    return queue_stream()->get_latest_event();
  }

  // This function checks if the runtime recorded an error for the
  // underlying stream device.
  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
    return queue_stream()->has_local_memory();
  }

  EIGEN_STRONG_INLINE long max_buffer_size() const {
    return queue_stream()->max_buffer_size();
  }

  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return queue_stream()->getPlatformName();
  }

  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return queue_stream()->getDeviceName();
  }

  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return queue_stream()->getDeviceVendor();
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H