// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorScanSycl.h
 *
 * \brief:
 *  Tensor Scan Sycl implements an extended version of
 *  "Efficient parallel scan algorithms for GPUs" for Tensor operations.
 *  The algorithm requires up to 3 stages (and consequently 3 kernels),
 *  depending on the size of the tensor. In the first kernel
 *  (ScanKernelFunctor), each thread within the work-group individually
 *  reduces its allocated elements in order to reduce the total number of
 *  blocks. In the next step, all threads within the work-group reduce the
 *  associated blocks into the temporary buffer. In the next kernel
 *  (ScanBlockKernelFunctor), the temporary buffer is given as an input and
 *  all the threads within a work-group scan and reduce the boundaries
 *  between the blocks (generated by the previous kernel) and write the data
 *  to the temporary buffer. If the second kernel is required, the third and
 *  final kernel (ScanAdjustmentKernelFunctor) adjusts the final result into
 *  the output buffer.
 *  The original algorithm for the parallel prefix sum can be found here:
 *
 *  Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
 *  scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
 *  1, no. 1 (2008): 1-17.
 *****************************************************************/
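
/*****************************************************************
 * Illustrative sketch (not part of the implementation): the per-block phase
 * of the kernels below follows the classic work-efficient (Blelloch)
 * exclusive scan, i.e. an up-sweep that builds partial sums in a binary tree
 * followed by a down-sweep that distributes them. A minimal serial C++
 * equivalent over a power-of-two sized buffer is shown here; the function
 * name `block_exclusive_scan`, the binary operation `op`, and the identity
 * value `init` are hypothetical placeholders, not names used by this file.
 *
 *   template <typename T, typename Op>
 *   void block_exclusive_scan(T* data, int n, Op op, T init) {
 *     int offset = 1;
 *     // up-sweep: build partial sums in place up the tree
 *     for (int d = n >> 1; d > 0; d >>= 1) {
 *       for (int i = 0; i < d; ++i) {
 *         int ai = offset * (2 * i + 1) - 1;
 *         int bi = offset * (2 * i + 2) - 1;
 *         data[bi] = op(data[ai], data[bi]);
 *       }
 *       offset *= 2;
 *     }
 *     // clear the root so the scan becomes exclusive
 *     data[n - 1] = init;
 *     // down-sweep: traverse down the tree and build the scan
 *     for (int d = 1; d < n; d *= 2) {
 *       offset >>= 1;
 *       for (int i = 0; i < d; ++i) {
 *         int ai = offset * (2 * i + 1) - 1;
 *         int bi = offset * (2 * i + 2) - 1;
 *         T t = data[ai];
 *         data[ai] = data[bi];
 *         data[bi] = op(t, data[bi]);
 *       }
 *     }
 *   }
 *
 * The device kernels run the same index pattern per work-item on registers
 * (private_scan) and per work-group on local memory (scratch), with
 * work-group barriers replacing the serial loop ordering.
 *****************************************************************/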

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP

namespace Eigen {
namespace TensorSycl {
namespace internal {

#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
#endif

template <typename index_t>
struct ScanParameters {
  // must be power of 2
  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
  const index_t total_size;
  const index_t non_scan_size;
  const index_t scan_size;
  const index_t non_scan_stride;
  const index_t scan_stride;
  const index_t panel_threads;
  const index_t group_threads;
  const index_t block_threads;
  const index_t elements_per_group;
  const index_t elements_per_block;
  const index_t loop_range;

  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
      : total_size(total_size_),
        non_scan_size(non_scan_size_),
        scan_size(scan_size_),
        non_scan_stride(non_scan_stride_),
        scan_stride(scan_stride_),
        panel_threads(panel_threads_),
        group_threads(group_threads_),
        block_threads(block_threads_),
        elements_per_group(elements_per_group_),
        elements_per_block(elements_per_block_),
        loop_range(loop_range_) {}
};

enum class scan_step { first, second };
template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
          scan_step stp>
struct ScanKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;

  LocalAccessor scratch;
  Evaluator dev_eval;
  OutAccessor out_accessor;
  OutAccessor temp_accessor;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  const bool inclusive;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
                                                          const bool inclusive_)
      : scratch(scratch_),
        dev_eval(dev_eval_),
        out_accessor(out_accessor_),
        temp_accessor(temp_accessor_),
        scanParameters(scanParameters_),
        accumulator(accumulator_),
        inclusive(inclusive_) {}

  template <scan_step sst = stp, typename Input>
  typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE
  read(const Input &inpt, Index global_id) {
    return inpt.coeff(global_id);
  }

  template <scan_step sst = stp, typename Input>
  typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE
  read(const Input &inpt, Index global_id) {
    return inpt[global_id];
  }

  template <scan_step sst = stp, typename InclusiveOp>
  typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  first_step_inclusive_Operation(InclusiveOp inclusive_op) {
    inclusive_op();
  }

  template <scan_step sst = stp, typename InclusiveOp>
  typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  first_step_inclusive_Operation(InclusiveOp) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    auto out_ptr = out_accessor.get_pointer();
    auto tmp_ptr = temp_accessor.get_pointer();
    auto scratch_ptr = scratch.get_pointer().get();

    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;
      // we put one element per packet in scratch_mem
      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
      CoeffReturnType inclusive_scan;
      // the actual panel size is scan_size * non_scan_size.
      // elements_per_panel is rounded up to a power of 2 for the binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // This will be effective when the size is bigger than elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
                           (global_id < scanParameters.total_size))
                              ? read(dev_eval, global_id)
                              : accumulator.initialize();
        next_elements += scanParameters.scan_stride;
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
        }
      });
      // This for loop runs exactly twice (ScanPerThread / PacketSize == 2)
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        Index private_offset = 1;
        // build sum in place up the tree
        EIGEN_UNROLL_LOOP
        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[bi] = accumulator.finalize(accum);
          }
          private_offset *= 2;
        }
        scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
            private_scan[PacketSize - 1 + packetIndex];
        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
        // traverse down tree & build scan
        EIGEN_UNROLL_LOOP
        for (Index d = 1; d < PacketSize; d *= 2) {
          private_offset >>= 1;
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[ai] = private_scan[bi];
            private_scan[bi] = accumulator.finalize(accum);
          }
        }
      }

      Index offset = 1;
      // build sum in place up the tree
      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch_ptr[ai], &accum);
          accumulator.reduce(scratch_ptr[bi], &accum);
          scratch_ptr[bi] = accumulator.finalize(accum);
        }
        offset *= 2;
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // next step optimisation
      if (local_id == 0) {
        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
                                    scanParameters.non_scan_size +
                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
                                block_id;
          tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
        }
        // clear the last element
        scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
      }
      // traverse down tree & build scan
      for (Index d = 1; d < scratch_stride; d *= 2) {
        offset >>= 1;
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch_ptr[ai], &accum);
          accumulator.reduce(scratch_ptr[bi], &accum);
          scratch_ptr[ai] = scratch_ptr[bi];
          scratch_ptr[bi] = accumulator.finalize(accum);
        }
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // This for loop runs exactly twice (ScanPerThread / PacketSize == 2)
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        EIGEN_UNROLL_LOOP
        for (Index i = 0; i < PacketSize; i++) {
          CoeffReturnType accum = private_scan[packetIndex + i];
          accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
          private_scan[packetIndex + i] = accumulator.finalize(accum);
        }
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
          private_scan[0] = accumulator.finalize(inclusive_scan);
        }
      });
      next_elements = 0;
      // write the first set of private params
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
          out_ptr[global_id] = private_scan[private_id];
        }
        next_elements += scanParameters.scan_stride;
      }
    }  // end for loop
  }
};

template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
struct ScanAdjustmentKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
  InAccessor in_accessor;
  OutAccessor out_accessor;
  const ScanParameters<Index> scanParameters;
  Op accumulator;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
                                                                    OutAccessor out_accessor_,
                                                                    const ScanParameters<Index> scanParameters_,
                                                                    Op accumulator_)
      : in_accessor(in_accessor_),
        out_accessor(out_accessor_),
        scanParameters(scanParameters_),
        accumulator(accumulator_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    auto in_ptr = in_accessor.get_pointer();
    auto out_ptr = out_accessor.get_pointer();

    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;

      // the actual panel size is scan_size * non_scan_size.
      // elements_per_panel is rounded up to a power of 2 for the binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // This will be effective when the size is bigger than elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;

      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
      CoeffReturnType adjust_val = in_ptr[in_id];

      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          CoeffReturnType accum = adjust_val;
          accumulator.reduce(out_ptr[global_id], &accum);
          out_ptr[global_id] = accumulator.finalize(accum);
        }
        next_elements += scanParameters.scan_stride;
      }
    }
  }
};

template <typename Index>
struct ScanInfo {
  const Index &total_size;
  const Index &scan_size;
  const Index &panel_size;
  const Index &non_scan_size;
  const Index &scan_stride;
  const Index &non_scan_stride;

  Index max_elements_per_block;
  Index block_size;
  Index panel_threads;
  Index group_threads;
  Index block_threads;
  Index elements_per_group;
  Index elements_per_block;
  Index loop_range;
  Index global_range;
  Index local_range;
  const Eigen::SyclDevice &dev;

  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
                               const Eigen::SyclDevice &dev_)
      : total_size(total_size_),
        scan_size(scan_size_),
        panel_size(panel_size_),
        non_scan_size(non_scan_size_),
        scan_stride(scan_stride_),
        non_scan_stride(non_scan_stride_),
        dev(dev_) {
    // must be power of 2
    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));

    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;

    elements_per_group =
        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
    const Index elements_per_panel = elements_per_group * non_scan_size;
    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
    block_size = elements_per_group / elements_per_block;
#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
#else
    const Index max_threads = panel_threads * panel_size;
#endif
    global_range = roundUp(max_threads, local_range);
    loop_range = Index(
        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
  }

  inline ScanParameters<Index> get_scan_parameter() {
    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
  }

  inline cl::sycl::nd_range<1> get_thread_range() {
    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
  }
};

template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
struct SYCLAdjustBlockOffset {
  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
                                                           Reducer &accumulator, const Index total_size,
                                                           const Index scan_size, const Index panel_size,
                                                           const Index non_scan_size, const Index scan_stride,
                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);

    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
        AdjustFuctor;
    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
                                                                      scan_info.max_elements_per_block,
                                                                      scan_info.get_scan_parameter(), accumulator);
  }
};

template <typename CoeffReturnType, scan_step stp>
struct ScanLauncher_impl {
  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
                                             const Index total_size, const Index scan_size, const Index panel_size,
                                             const Index non_scan_size, const Index scan_stride,
                                             const Index non_scan_stride, const bool inclusive,
                                             const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
    CoeffReturnType *temp_pointer =
        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);

    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
        in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
        scan_info.get_scan_parameter(), accumulator, inclusive);

    if (scan_info.block_size > 1) {
      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
          non_scan_size, Index(1), scan_info.block_size, false, dev);

      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
          non_scan_stride, dev);
    }
    dev.deallocate_temp(temp_pointer);
  }
};

}  // namespace internal
}  // namespace TensorSycl
namespace internal {
template <typename Self, typename Reducer, bool vectorize>
struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
  typedef typename Self::Index Index;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  void operator()(Self &self, EvaluatorPointerType data) {
    const Index total_size = internal::array_prod(self.dimensions());
    const Index scan_size = self.size();
    const Index scan_stride = self.stride();
    // this is the scan op (can be sum or ...)
    auto accumulator = self.accumulator();
    auto inclusive = !self.exclusive();
    auto consume_dim = self.consume_dim();
    auto dev = self.device();

    auto dims = self.inner().dimensions();

    Index non_scan_size = 1;
    Index panel_size = 1;
    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < consume_dim; i++) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
        panel_size *= dims[i];
      }
    } else {
      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim - 1; i >= 0; i--) {
        panel_size *= dims[i];
      }
    }
    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
    auto eval_impl = self.inner();
    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
        inclusive, dev);
  }
};
}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP