// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <[email protected]>
// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
// so we'll use a macro to make clang happy.
#ifndef KERNEL_FRIEND
#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
#else
#define KERNEL_FRIEND friend
#endif
#endif


namespace Eigen {


/** \class TensorReduction
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor reduction class.
  *
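  * A minimal usage sketch (illustrative; the user-facing reduction methods
  * such as Tensor::sum() build a TensorReductionOp expression internally):
  * \code
  * Eigen::Tensor<float, 3> input(4, 5, 6);
  * input.setRandom();
  * Eigen::array<Eigen::Index, 2> reduce_dims{{1, 2}};
  * Eigen::Tensor<float, 1> result = input.sum(reduce_dims);  // rank 3 - 2 = 1
  * \endcode
  *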
  */

namespace internal {
  template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ >
  struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
 : traits<XprType>
{
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::Scalar Scalar;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;

  template <class T> struct MakePointer {
    // Intermediate typedef to work around an MSVC issue.
    typedef MakePointer_<T> MakePointerT;
    typedef typename MakePointerT::Type Type;
  };
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
{
  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
{
  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
};


template <typename OutputDims> struct DimInitializer {
  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims,
                  const array<bool, internal::array_size<InputDims>::value>& reduced,
                  OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};

template <> struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims, const array<bool, Rank>&,
                  Sizes<>*, array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};
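
// Illustrative sketch (hypothetical values): for a rank-4 input with dimensions
// (4, 5, 6, 7) and reduced = {false, true, false, true}, the primary template
// above fills output_dims with (4, 6) and reduced_dims with (5, 7). The Sizes<>
// specialization handles full reductions, where every input dimension is copied
// into reduced_dims and there are no output dimensions.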


template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};

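// When the reduced dimensions are statically known (see the specializations
// below), are_inner_most_dims detects that they form a contiguous block that
// starts at the innermost dimension (dimension 0 for ColMajor, dimension
// NumTensorDims - 1 for RowMajor), and preserve_inner_most_dims detects that
// the innermost dimension is not reduced. Both default to false otherwise.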
#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
#endif


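// Visits every coefficient of the reduced subspace rooted at firstIndex by
// recursing over the reduced dimensions, peeling one dimension per level:
// the DimIndex == 0 specialization runs the innermost loop, and the
// DimIndex == -1 specialization handles the degenerate case where there are
// no reduced dimensions and a single coefficient is reduced.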
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};

template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
                                   !Self::ReducerTraits::IsExactlyAssociative)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, false> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
    }
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalizeBoth(accum, paccum);
  }
};

#if !defined(EIGEN_HIPCC)
static const int kLeafSize = 1024;

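// The two specializations below implement a binary tree reduction: ranges
// longer than kLeafSize are split in half and reduced recursively, which keeps
// the depth of dependent accumulations logarithmic. They are selected when the
// reducer is stateless but not exactly associative (e.g. floating-point sums),
// where pairwise accumulation noticeably reduces round-off error.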
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, false, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
  reduce(const Self& self, typename Self::Index firstIndex,
         typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    if (numValuesToReduce > kLeafSize) {
      const typename Self::Index half = numValuesToReduce / 2;
      reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
      reducer.reduce(
          reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
          &accum);
    } else {
      for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
      }
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
  reduce(const Self& self, typename Self::Index firstIndex,
         typename Self::Index numValuesToReduce, Op& reducer) {
    const typename Self::Index packetSize =
        internal::unpacket_traits<typename Self::PacketReturnType>::size;
    typename Self::CoeffReturnType accum = reducer.initialize();
    if (numValuesToReduce > packetSize * kLeafSize) {
      // Make sure the split point is aligned on a packet boundary.
      const typename Self::Index split =
          packetSize *
          divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
                packetSize);
      const typename Self::Index num_left =
          numext::mini(split - firstIndex, numValuesToReduce);
      reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
      if (num_left < numValuesToReduce) {
        reducer.reduce(
            reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
      }
      return reducer.finalize(accum);
    } else {
      const typename Self::Index UnrollSize =
          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
      const typename Self::Index VectorizedSize =
          (numValuesToReduce / packetSize) * packetSize;
      typename Self::PacketReturnType paccum =
          reducer.template initializePacket<typename Self::PacketReturnType>();
      typename Self::PacketReturnType paccum2 =
          reducer.template initializePacket<typename Self::PacketReturnType>();
      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
        reducer.reducePacket(
            self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
        reducer.reducePacket(
            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
            &paccum2);
      }
      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
                                 firstIndex + j), &paccum);
      }
      reducer.reducePacket(paccum2, &paccum);
      for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
           ++j) {
        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
      }
      return reducer.finalizeBoth(accum, paccum);
    }
  }
};
#endif

template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};

template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

// Default full reducer
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducer {
  static const bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
  }
};


#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducerShard {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                  typename Self::Index numValuesToReduce, Op& reducer,
                  typename Self::CoeffReturnType* output) {
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
        self, firstIndex, numValuesToReduce, reducer);
  }
};

// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
  static const Index PacketSize =
      unpacket_traits<typename Self::PacketReturnType>::size;

  // launch one reducer per thread and accumulate the result.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    if (num_coeffs == 0) {
      *output = reducer.finalize(reducer.initialize());
      return;
    }
    const TensorOpCost cost =
        self.m_impl.costPerCoeff(Vectorizable) +
        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                     PacketSize);
    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
        num_coeffs, cost, device.numThreads());
    if (num_threads == 1) {
      *output =
          InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
      return;
    }
    const Index blocksize =
        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
                                  self, i * blocksize, blocksize, reducer,
                                  &shards[i]);
    }
    typename Self::CoeffReturnType finalShard;
    if (numblocks * blocksize < num_coeffs) {
      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
          self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
          reducer);
    } else {
      finalShard = reducer.initialize();
    }
    barrier.Wait();

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i], &finalShard);
    }
    *output = reducer.finalize(finalShard);
  }
};

#endif


// Default inner reducer
template <typename Self, typename Op, typename Device>
struct InnerReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

// Default outer reducer
template <typename Self, typename Op, typename Device>
struct OuterReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

#ifdef EIGEN_USE_SYCL
// Default Generic reducer
template <typename Self, typename Op, typename Device>
struct GenericReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};
#endif

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);


#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);

#endif

template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);

template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif

/**
 * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
 * This allows the reduction to have a different type for the accumulator than the input data type.
 * If this is the case, the functor needs to have two reduce methods: one for reducing an element of the input
 * with the accumulator and the other for reducing two accumulators.
 * Such a reducer can be useful, for instance, when the accumulator is a boolean or a bitset that checks for
 * some properties of the input.
 */
template <typename Op, typename CoeffReturnType>
struct ReductionReturnType {
#if defined(EIGEN_USE_SYCL)
  typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
#else
  typedef typename remove_const<CoeffReturnType>::type type;
#endif
};
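
// Illustrative sketch of such a reducer (hypothetical, not part of Eigen):
// the accumulator type (bool) differs from the input type (float), so two
// reduce() overloads are provided, one folding an input element into the
// accumulator and one merging two partial accumulators.
//
//   struct AnyNegativeReducer {
//     EIGEN_DEVICE_FUNC bool initialize() const { return false; }
//     // Reduce one input element into the accumulator.
//     EIGEN_DEVICE_FUNC void reduce(float x, bool* accum) const {
//       *accum = *accum || (x < 0.0f);
//     }
//     // Merge two partial accumulators.
//     EIGEN_DEVICE_FUNC void reduce(bool partial, bool* accum) const {
//       *accum = *accum || partial;
//     }
//     EIGEN_DEVICE_FUNC bool finalize(bool accum) const { return accum; }
//   };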

}  // end namespace internal


template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
  public:
    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
    { }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
    { }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const XprType& expression() const { return m_expr; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Dims& dims() const { return m_dims; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Op& reducer() const { return m_reducer; }

  protected:
    typename XprType::Nested m_expr;
    const Dims m_dims;
    const Op m_reducer;
};

template<typename ArgType, typename Device>
struct TensorReductionEvaluatorBase;

// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
{
  typedef internal::reducer_traits<Op, Device> ReducerTraits;
  typedef Dims ReducedDims;
  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
  typedef typename XprType::Index Index;
  typedef ArgType ChildType;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;

  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  // Subset of strides of the input tensor for the non-reduced dimensions.
  // Indexed by output dimensions.
  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
    BlockAccess = false,
    PreferBlockAccess = true,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool RunningFullReduction = (NumOutputDims==0);

  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
  {
    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Build the bitmap indicating if an input dimension is reduced or not.
    for (int i = 0; i < NumInputDims; ++i) {
      m_reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      m_reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);

    // Precompute output strides.
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
        }
      } else {
        m_outputStrides[NumOutputDims - 1] = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
        }
      }
    }

    // Precompute input strides.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i-1] * input_dims[i-1];
        }
      } else {
        input_strides.back() = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (m_reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          m_output_to_input_dim_map[outputIndex] = i;
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }

    m_numValuesToReduce =
        NumOutputDims == 0
            ? internal::array_prod(input_dims)
            : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
                  ? m_preservedStrides[0]
                  : m_preservedStrides[NumOutputDims - 1];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE
  bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
    // Use the FullReducer if possible.
    if ((RunningFullReduction && RunningOnSycl) || (RunningFullReduction &&
        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
         !RunningOnGPU))) {
      bool need_assign = false;
      if (!data) {
        m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
        data = m_result;
        need_assign = true;
      }
      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }

    // Attempt to use an optimized reduction.
    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
      bool reducing_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          reducing_inner_dims &= m_reduced[i];
        } else {
          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
        }
      }
      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
          (reducing_inner_dims || ReducingInnerMostDims)) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        // For SYCL this call always returns false.
        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate_temp(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }

      bool preserving_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
        } else {
          preserving_inner_dims &= m_reduced[i];
        }
      }
      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
          preserving_inner_dims) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        // For SYCL this call always returns false.
        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate_temp(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }
      #if defined(EIGEN_USE_SYCL)
      // If there is no optimised version for SYCL, the reduction expression
      // must be broken into two subexpressions and the generic SYCL reducer used on the device.
      if(RunningOnSycl) {
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         if (!data) {
           data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
           m_result = data;
         }
         Op reducer(m_reducer);
         internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
         return (m_result != NULL);
       }
      #endif
    }
    return true;
  }

#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE
      void
      evalSubExprsIfNeededAsync(EvaluatorPointerType data,
                                EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
      done(evalSubExprsIfNeededCommon(data));
    });
  }
#endif

  EIGEN_STRONG_INLINE
  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return evalSubExprsIfNeededCommon(data);
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate_temp(m_result);
      m_result = NULL;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if ((RunningFullReduction || RunningOnGPU) && m_result) {
      return *(m_result + index);
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
                                                             num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }

  // TODO(bsteiner): provide a more efficient implementation.
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));

    if (RunningOnGPU && m_result) {
      return internal::pload<PacketReturnType>(m_result + index);
    }

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < PacketSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // TBD: extend this to the n innermost dimensions that we preserve.
      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  // Must be called after evalSubExprsIfNeeded().
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    if (RunningFullReduction && m_result) {
      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    } else {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
          TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    }
  }

  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
    m_result.bind(cgh);
  }
#endif

  private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
  template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
  template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);

  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif

#if defined(EIGEN_USE_SYCL)
 template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
 // SYCL needs the generic reducer for the case where the reduction algorithm is neither an inner, outer, nor full reducer.
 template <typename, typename, typename> friend struct internal::GenericReducer;
#endif


  template <typename S, typename O, typename D> friend struct internal::InnerReducer;

  struct BlockIteratorState {
    Index input_dim;
    Index output_size;
    Index output_count;
  };

  // Returns the Index in the input tensor of the first value that needs to be
  // used to compute the reduction at output index "index".
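  // Worked example (illustrative): for a ColMajor input of dimensions
  // (4, 5, 6) reduced over dimension 1, the preserved strides are (1, 20), so
  // output index i + 4 * j (i < 4, j < 6) maps to input index i + 20 * j, the
  // first of the five reduced coefficients i + 4 * m + 20 * j, m = 0..4.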
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    // TBD: optimize the case where we preserve the innermost dimensions.
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }

  // Bitmap indicating if an input dimension is reduced or not.
  array<bool, NumInputDims> m_reduced;
  // Dimensions of the output of the operation.
  Dimensions m_dimensions;
  // Precomputed strides for the output tensor.
  array<Index, NumOutputDims> m_outputStrides;
  array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
  array<Index, NumPreservedStrides> m_preservedStrides;
  // Map from output to input dimension index.
  array<Index, NumOutputDims> m_output_to_input_dim_map;
  // How many values go into each reduction.
  Index m_numValuesToReduce;

  // Subset of strides of the input tensor for the reduced dimensions.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  // Size of the input dimensions that are reduced.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedDims;

  // Evaluator for the input expression.
  TensorEvaluator<ArgType, Device> m_impl;

  // Operation to apply for computing the reduction.
  Op m_reducer;

  // For full reductions
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
  static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
  static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
  static const bool RunningOnGPU = false;
#else
  static const bool RunningOnGPU = false;
  static const bool RunningOnSycl = false;
#endif
  EvaluatorPointerType m_result;

  const Device EIGEN_DEVICE_REF m_device;
};

template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
};


template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {

  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
  // The coeff function in the base class uses a recursive method, which is not standard layout and cannot be used in the SYCL kernel.
  // Therefore the coeff function is overridden here for the SYCL kernel.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
    return *(this->data() + index);
  }
  // The packet function in the base class uses a recursive method, which is not standard layout and cannot be used in the SYCL kernel.
  // Therefore the packet function is overridden here for the SYCL kernel.
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
  }
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H