// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This is the specialization of the reduction operation. A two-phase
 *  reduction approach is used because the GPU has no global synchronization
 *  of global memory across different work-groups/thread-blocks. To solve
 *  this, two kernels are used to reduce the data: in the first kernel each
 *  work-group/thread-block reduces its part of the data locally and writes
 *  its partial result to global memory. In the second phase (global
 *  reduction) a single work-group/thread-block reduces the intermediate
 *  partial results into one element.
 *  Here is an NVIDIA presentation explaining the optimized two-phase
 *  reduction algorithm on the GPU:
 *  https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace TensorSycl {
namespace internal {

template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};
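// Second phase of the full reduction: a single work-group loads the per-group
// partial results produced by the first phase and tree-reduces them in local
// memory down to the final element.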
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    // Our empirical research shows that the best performance is achieved when
    // there is only one element per thread to reduce in the second step; the
    // second-step reduction time is then almost negligible. Hence, in the
    // second step the input size is fixed to the local size, so each thread
    // reads exactly one element. The algorithm must be changed if the number
    // of elements reduced per thread in the second step becomes greater than
    // one; otherwise, the result will be wrong.
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI.get_pointer() + localid;
    auto aOutPtr = outAcc.get_pointer();
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};

// Full reduction, first phase. In this version vectorization is enabled and the
// reduction accepts any generic reducer op (e.g. max, min, sum, mean, iamax, iamin).
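// Each work-group accumulates a grid-strided slice of the input, tree-reduces
// the per-thread accumulators in local memory, and writes one partial result
// per work-group to global memory for the second phase.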
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef
      typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
                                              PacketReturnType, CoeffReturnType>::type OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // vectorizable part
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // non-vectorizable tail
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // local tree reduction; the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }
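  // Scalar (non-vectorized) path: the same grid-stride accumulation and
  // local-memory tree reduction as above, operating on individual coefficients
  // instead of packets.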
  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    CoeffReturnType accumulator = op.initialize();
    // non-vectorizable part
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // local tree reduction; the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};

template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    auto output_accessor_ptr = output_accessor.get_pointer();
    // const_cast added as a naive work-around for the qualifier-drop error
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};

enum class reduction_dim { inner_most, outer_most };

// The default is to preserve the non-reduced dimension.
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;
  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    auto scratchPtr = scratch.get_pointer().get();
    auto outPtr =
        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    /* Apply the reduction operation between the current local
     * id and the one on the other half of the vector. */
    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // LocalThreadSizeR is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        // The result has already been divided for the mean reducer in the
        // previous reduction, so there is no need to divide it again here.
        *out_scratch_ptr = op.finalize(accumulator);
      }
      /* All threads collectively read from global memory into local.
       * The barrier ensures all threads' IO is resolved before
       * execution continues (strictly speaking, all threads within
       * a single work-group - there is no co-ordination between
       * work-groups, only work-items). */
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};

template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor.get_pointer() + globalId;

    OutScalar accumulator = op.initialize();
    // num_coeffs_to_reduce is no bigger than 256
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
  }
};

template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};

template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
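  // The work-group is treated as a LocalThreadSizeP x LocalThreadSizeR tile:
  // P work-items along the preserved dimension and R along the reduced one.
  // When BC is set, each scratch row has one extra padding element
  // (LocalThreadSizeP + BC), presumably to limit local-memory bank conflicts.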
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    // getPowerOfTwo makes sure the local range is a power of 2 and <=
    // maxSyclThreadPerBlock; this lets us avoid an extra check in the kernel.
    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The local thread size must be a power of 2 for the reduction "
                  "operation");

    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // Here we force the code to be at most a 2-step reduction: our empirical
    // research shows that if each thread reduces at least 64 elements
    // individually, we get better performance. However, this can change on
    // different platforms. For the inner_most dim reducer, it is better to
    // have 8 groups in the reduce dimension for sizes > 1024 to achieve the
    // best performance.
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
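    // Split the reduce dimension across several work-groups only when there are
    // more than reductionPerThread * localRange coefficients to reduce; otherwise
    // one group per preserved panel is enough and the second-step kernel is skipped.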
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);

      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;

      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
          temp_accessor, output,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
          reducer, num_coeffs_to_preserve, rNumGroups);

      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);
    }
    return false;
  }
};
}  // namespace internal
}  // namespace TensorSycl

namespace internal {

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // Here we force the code to be at most a 2-step reduction: our empirical
    // research shows that if each thread reduces at least 512 elements
    // individually, we get better performance.
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
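    // Capping num_work_group at local_range guarantees that the second step can
    // reduce all per-group partial results with a single work-group, one element
    // per work-item (see SecondStepFullReducer above).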
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer);

      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
          tmp_global_accessor, data,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
          reducer);

      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer);
    }
  }
};

// Vectorizable inner-most dim preserver (column reduction)
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// Row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli): come up with a better kernel.
// Generic partial reduction.
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
    return false;
  }
};

}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP