// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <[email protected]>
// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
// so we'll use a macro to make clang happy.
#ifndef KERNEL_FRIEND
#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
#else
#define KERNEL_FRIEND friend
#endif
#endif


namespace Eigen {


/** \class TensorReduction
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor reduction class.
  *
  */

namespace internal {
template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ >
struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
 : traits<XprType>
{
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::Scalar Scalar;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;

  template <class T> struct MakePointer {
    // Intermediate typedef to workaround MSVC issue.
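    // (Presumably MSVC has trouble resolving "typename MakePointer_<T>::Type"
    // directly when MakePointer_ is a template template parameter, so the
    // instantiation is named first and its nested Type is taken below.)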
    typedef MakePointer_<T> MakePointerT;
    typedef typename MakePointerT::Type Type;
  };
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
{
  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
{
  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
};


template <typename OutputDims> struct DimInitializer {
  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims,
                  const array<bool, internal::array_size<InputDims>::value>& reduced,
                  OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};

template <> struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims, const array<bool, Rank>&,
                  Sizes<>*, array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};


template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};

#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;

};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;

};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims,
                                RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
#endif


template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};

template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
                                   !Self::ReducerTraits::IsExactlyAssociative)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, false> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
    }
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalizeBoth(accum, paccum);
  }
};

#if !defined(EIGEN_HIPCC)
static const int kLeafSize = 1024;

template <typename Self, typename Op>
struct
       InnerMostDimReducer<Self, Op, false, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
  reduce(const Self& self, typename Self::Index firstIndex,
         typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    if (numValuesToReduce > kLeafSize) {
      const typename Self::Index half = numValuesToReduce / 2;
      reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
      reducer.reduce(
          reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
          &accum);
    } else {
      for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
      }
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
  reduce(const Self& self, typename Self::Index firstIndex,
         typename Self::Index numValuesToReduce, Op& reducer) {
    const typename Self::Index packetSize =
        internal::unpacket_traits<typename Self::PacketReturnType>::size;
    typename Self::CoeffReturnType accum = reducer.initialize();
    if (numValuesToReduce > packetSize * kLeafSize) {
      // Make sure the split point is aligned on a packet boundary.
      const typename Self::Index split =
          packetSize *
          divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
                packetSize);
      const typename Self::Index num_left =
          numext::mini(split - firstIndex, numValuesToReduce);
      reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
      if (num_left < numValuesToReduce) {
        reducer.reduce(
            reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
      }
      return reducer.finalize(accum);
    } else {
      const typename Self::Index UnrollSize =
          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
      const typename Self::Index VectorizedSize =
          (numValuesToReduce / packetSize) * packetSize;
      typename Self::PacketReturnType paccum =
          reducer.template initializePacket<typename Self::PacketReturnType>();
      typename Self::PacketReturnType paccum2 =
          reducer.template initializePacket<typename Self::PacketReturnType>();
      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
        reducer.reducePacket(
            self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
        reducer.reducePacket(
            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
            &paccum2);
      }
      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j += packetSize) {
        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
            firstIndex + j), &paccum);
      }
      reducer.reducePacket(paccum2, &paccum);
      for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
           ++j) {
        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
      }
      return reducer.finalizeBoth(accum, paccum);
    }
  }
};
#endif

template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

template <int DimIndex, typename Self, typename Op>
struct
       InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};

template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

// Default full reducer
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducer {
  static const bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
  }
};


#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducerShard {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                                                        typename Self::Index numValuesToReduce, Op& reducer,
                                                        typename Self::CoeffReturnType* output) {
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
        self, firstIndex, numValuesToReduce, reducer);
  }
};

// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
  static const Index PacketSize =
      unpacket_traits<typename Self::PacketReturnType>::size;

  // launch one reducer per thread and accumulate the result.
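  // The cost model picks a thread count from the per-coefficient cost; with a
  // single thread the reduction is done inline. Otherwise the coefficients are
  // split into equally sized blocks, one FullReducerShard is enqueued per
  // block, the remainder is reduced on the calling thread, and the per-block
  // partial results are combined once the barrier has been signalled.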
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    if (num_coeffs == 0) {
      *output = reducer.finalize(reducer.initialize());
      return;
    }
    const TensorOpCost cost =
        self.m_impl.costPerCoeff(Vectorizable) +
        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                     PacketSize);
    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
        num_coeffs, cost, device.numThreads());
    if (num_threads == 1) {
      *output =
          InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
      return;
    }
    const Index blocksize =
        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
                                  self, i * blocksize, blocksize, reducer,
                                  &shards[i]);
    }
    typename Self::CoeffReturnType finalShard;
    if (numblocks * blocksize < num_coeffs) {
      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
          self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
          reducer);
    } else {
      finalShard = reducer.initialize();
    }
    barrier.Wait();

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i], &finalShard);
    }
    *output = reducer.finalize(finalShard);
  }
};

#endif


// Default inner reducer
template <typename Self, typename Op, typename Device>
struct InnerReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

// Default outer reducer
template <typename Self, typename Op, typename Device>
struct OuterReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

#ifdef EIGEN_USE_SYCL
// Default Generic reducer
template <typename Self, typename Op, typename Device>
struct GenericReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};
#endif

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);


#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void
    ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);

#endif

template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);

template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif

/**
 * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
 * This allows the reduction to have a different type for the accumulator than the input data type.
 * If this is the case, the functor needs to have two reduce methods: one for reducing an element of the input
 * with the accumulator and the other for reducing two accumulators.
 * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
 * some properties of the input.
 */
template <typename Op, typename CoeffReturnType>
struct ReductionReturnType {
#if defined(EIGEN_USE_SYCL)
  typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
#else
  typedef typename remove_const<CoeffReturnType>::type type;
#endif
};

}  // end namespace internal


template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
  public:
    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
    { }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
    { }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const XprType& expression() const { return m_expr; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Dims& dims() const { return m_dims; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Op& reducer() const { return m_reducer; }

  protected:
    typename XprType::Nested m_expr;
    const Dims m_dims;
    const Op m_reducer;
};

template<typename ArgType, typename Device>
struct TensorReductionEvaluatorBase;

// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
{
  typedef internal::reducer_traits<Op, Device> ReducerTraits;
  typedef Dims ReducedDims;
  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
  typedef typename XprType::Index Index;
  typedef ArgType ChildType;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;

  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  // Subset of strides of the input tensor for the non-reduced dimensions.
  // Indexed by output dimensions.
  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
    BlockAccess = false,
    PreferBlockAccess = true,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool RunningFullReduction = (NumOutputDims==0);

  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
  {
    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Build the bitmap indicating if an input dimension is reduced or not.
    for (int i = 0; i < NumInputDims; ++i) {
      m_reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      m_reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);

    // Precompute output strides.
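    // (ColMajor: the stride of output dimension i is the product of output
    // dimensions 0..i-1, innermost first; RowMajor mirrors this from the other
    // end. The matching TensorIntDivisor entries make integer division by
    // these strides cheap when decomposing output indices.)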
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
        }
      } else {
        m_outputStrides[NumOutputDims - 1] = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
        }
      }
    }

    // Precompute input strides.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i-1] * input_dims[i-1];
        }
      } else {
        input_strides.back() = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (m_reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          m_output_to_input_dim_map[outputIndex] = i;
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }

    m_numValuesToReduce =
        NumOutputDims == 0
            ? internal::array_prod(input_dims)
            : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
                  ? m_preservedStrides[0]
                  : m_preservedStrides[NumOutputDims - 1];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE
  bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
    // Use the FullReducer if possible.
    if ((RunningFullReduction && RunningOnSycl) || (RunningFullReduction &&
        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
         !RunningOnGPU))) {
      bool need_assign = false;
      if (!data) {
        m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
        data = m_result;
        need_assign = true;
      }
      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }

    // Attempt to use an optimized reduction.
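    // On a GPU with majorDeviceVersion() >= 3, or on SYCL, try the specialized
    // paths below: InnerReducer when the reduced dimensions are the innermost
    // dimensions of the input, OuterReducer when they are the outermost ones.
    // Either path may allocate a temporary result buffer if none was supplied.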
    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
      bool reducing_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          reducing_inner_dims &= m_reduced[i];
        } else {
          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
        }
      }
      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
          (reducing_inner_dims || ReducingInnerMostDims)) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        // For SYCL this call always returns false.
        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate_temp(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }

      bool preserving_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
        } else {
          preserving_inner_dims &= m_reduced[i];
        }
      }
      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
          preserving_inner_dims) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        // For SYCL this call always returns false.
        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate_temp(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }
#if defined(EIGEN_USE_SYCL)
      // If there is no optimised version for SYCL, the reduction expression
      // must be broken into two subexpressions and the generic SYCL reducer
      // used on the device.
      if (RunningOnSycl) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
          m_result = data;
        }
        Op reducer(m_reducer);
        internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
        return (m_result != NULL);
      }
#endif
    }
    return true;
  }

#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE
  void
  evalSubExprsIfNeededAsync(EvaluatorPointerType data,
                            EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
      done(evalSubExprsIfNeededCommon(data));
    });
  }
#endif

  EIGEN_STRONG_INLINE
  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return evalSubExprsIfNeededCommon(data);
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate_temp(m_result);
      m_result = NULL;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if ((RunningFullReduction || RunningOnGPU) && m_result) {
      return *(m_result + index);
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce =
          (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
                                                             num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }

  // TODO(bsteiner): provide a more efficient implementation.
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));

    if (RunningOnGPU && m_result) {
      return internal::pload<PacketReturnType>(m_result + index);
    }

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
          (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < PacketSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // TBD: extend this to the n innermost dimensions that we preserve.
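      // The vectorized path below is only taken when the whole packet falls
      // inside a single run of the innermost preserved dimension; otherwise
      // the packet is assembled from scalar coeff() calls.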
      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  // Must be called after evalSubExprsIfNeeded().
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    if (RunningFullReduction && m_result) {
      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    } else {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
             TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    }
  }

  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
    m_result.bind(cgh);
  }
#endif

  private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
  template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
  template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);

  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif

#if defined(EIGEN_USE_SYCL)
  template <typename Evaluator_, typename Op__> friend class
      TensorSycl::internal::GenericNondeterministicReducer;
  // SYCL needs the generic reducer when the reduction algorithm is neither an
  // inner, outer, nor full reduction.
  template <typename, typename, typename> friend struct internal::GenericReducer;
#endif


  template <typename S, typename O, typename D> friend struct internal::InnerReducer;

  struct BlockIteratorState {
    Index input_dim;
    Index output_size;
    Index output_count;
  };

  // Returns the Index in the input tensor of the first value that needs to be
  // used to compute the reduction at output index "index".
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    // TBD: optimize the case where we preserve the innermost dimensions.
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }

  // Bitmap indicating if an input dimension is reduced or not.
  array<bool, NumInputDims> m_reduced;
  // Dimensions of the output of the operation.
  Dimensions m_dimensions;
  // Precomputed strides for the output tensor.
  array<Index, NumOutputDims> m_outputStrides;
  array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
  array<Index, NumPreservedStrides> m_preservedStrides;
  // Map from output to input dimension index.
  array<Index, NumOutputDims> m_output_to_input_dim_map;
  // How many values go into each reduction
  Index m_numValuesToReduce;

  // Subset of strides of the input tensor for the reduced dimensions.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  // Size of the input dimensions that are reduced.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedDims;

  // Evaluator for the input expression.
  TensorEvaluator<ArgType, Device> m_impl;

  // Operation to apply for computing the reduction.
  Op m_reducer;

  // For full reductions
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
  static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
  static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
  static const bool RunningOnGPU = false;
#else
  static const bool RunningOnGPU = false;
  static const bool RunningOnSycl = false;
#endif
  EvaluatorPointerType m_result;

  const Device EIGEN_DEVICE_REF m_device;
};

template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
};


template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {

  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
  // The coeff function in the base class uses the recursive method, which is
  // not standard layout and cannot be used inside a SYCL kernel. Therefore the
  // coeff function is overridden for the SYCL kernel.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
    return *(this->data() + index);
  }
  // The packet function in the base class uses the recursive method, which is
  // not standard layout and cannot be used inside a SYCL kernel. Therefore the
  // packet function is overridden for the SYCL kernel.
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
  }
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H