// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
            typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
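  *
  * Illustrative usage (a minimal sketch; the scan is normally reached
  * through the TensorBase::cumsum() / cumprod() helpers rather than by
  * constructing a TensorScanOp directly):
  * \code
  * Eigen::Tensor<float, 2> input(3, 4);
  * input.setRandom();
  * Eigen::Tensor<float, 2> sums  = input.cumsum(0);        // inclusive scan along axis 0
  * Eigen::Tensor<float, 2> prods = input.cumprod(1, true); // exclusive scan along axis 1
  * \endcode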
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};


namespace internal {

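// Scalar (non-vectorized) scan of a single line along the scan axis, starting
// at `offset` in the flattened output. Handles both the contiguous
// (stride == 1) and strided cases, and both exclusive and inclusive scans.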
template <typename Self>
EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  // Compute the scan along the axis, starting at the given offset
  typename Self::CoeffReturnType accum = self.accumulator().initialize();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
}

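// Vectorized counterpart of ReduceScalar: scans PacketSize adjacent scan lines
// at once using unaligned packet loads and stores. Callers only invoke it when
// at least PacketSize lines start at `offset` (i.e. stride() >= PacketSize).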
template <typename Self>
EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  using Scalar = typename Self::CoeffReturnType;
  using Packet = typename Self::PacketReturnType;
  // Compute the scan along the axis, starting at the calculated offset
  Packet accum = self.accumulator().template initializePacket<Packet>();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  }
}

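// Generic block reducer: serially scans every line of one outer block using
// the scalar kernel. The specializations below add vectorization and/or
// multi-threading.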
template <typename Self, bool Vectorize, bool Parallel>
struct ReduceBlock {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};

// Specialization for vectorized reduction.
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index idx2 = 0;
    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
      // Calculate the starting offset for the packet scan
      Index offset = idx1 + idx2;
      ReducePacket(self, offset, data);
    }
    for (; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};

// Single-threaded CPU implementation of scan
template <typename Self, typename Reducer, typename Device,
          bool Vectorize =
              (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
               internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a scan for each
    // remaining entry. The iteration is split into two nested loops to avoid
    // an integer division: idx1 walks over the outer blocks, while idx2
    // (inside ReduceBlock) walks over the scan lines within a block.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
      block_reducer(self, idx1, data);
    }
  }
};

#ifdef EIGEN_USE_THREADS

// Adjust block_size to avoid false sharing of cache lines among threads.
// kBlockAlignment is currently twice the cache-line size of Intel and ARM
// processors.
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
  const Index items_per_cacheline =
      numext::maxi<Index>(1, kBlockAlignment / item_size);
  return items_per_cacheline * divup(block_size, items_per_cacheline);
}

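// Parallel, vectorized block reducer: shards the packet-wide scans and the
// leftover scalar scans of one outer block across the thread pool.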
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index num_scalars = self.stride();
    Index num_packets = 0;
    if (self.stride() >= PacketSize) {
      num_packets = self.stride() / PacketSize;
      self.device().parallelFor(
          num_packets,
        TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
                     16 * PacketSize * self.size(), true, PacketSize),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index packet = first; packet < last; ++packet) {
            const Index idx2 = packet * PacketSize;
            ReducePacket(self, idx1 + idx2, data);
          }
        });
      num_scalars -= num_packets * PacketSize;
    }
    self.device().parallelFor(
        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index scalar = first; scalar < last; ++scalar) {
            const Index idx2 = num_packets * PacketSize + scalar;
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};

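// Parallel, non-vectorized block reducer: shards the scalar scans of one
// outer block across the thread pool.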
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    self.device().parallelFor(
        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index idx2 = first; idx2 < last; ++idx2) {
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};

// Specialization for multi-threaded execution.
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    const Index total_size = internal::array_prod(self.dimensions());
    const Index inner_block_size = self.stride() * self.size();
    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));

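    // Fall back to the single-threaded launcher when the tensor is small, or
    // when there are too few independent scan lines per block to shard
    // profitably.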
    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
      launcher(self, data);
      return;
    }

    if (parallelize_by_outer_blocks) {
      // Parallelize over outer blocks.
      const Index num_outer_blocks = total_size / inner_block_size;
      self.device().parallelFor(
          num_outer_blocks,
          TensorOpCost(inner_block_size, inner_block_size,
                       16 * PacketSize * inner_block_size, Vectorize,
                       PacketSize),
          [=](Index blk_size) {
            return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index idx1 = first; idx1 < last; ++idx1) {
              ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
              block_reducer(self, idx1 * inner_block_size, data);
            }
          });
    } else {
      // Parallelize over the inner packet/scalar dimension when the scan
      // axis is not the innermost dimension.
      ReduceBlock<Self, Vectorize, /*Parallel=*/true> block_reducer;
      for (Index idx1 = 0; idx1 < total_size;
           idx1 += self.stride() * self.size()) {
        block_reducer(self, idx1, data);
      }
    }
  }
};
#endif  // EIGEN_USE_THREADS

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();
}

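// GPU launcher: assigns one GPU thread per scan line (64 threads per block);
// each thread runs its scan sequentially in the kernel above.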
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;

    LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
#endif  // EIGEN_USE_GPU && (EIGEN_GPUCC)

}  // namespace internal

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef const ArgType ChildTypeNoConst;
  typedef const ArgType ChildType;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

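  // The scan result is materialized into a contiguous buffer up front (see
  // evalSubExprsIfNeeded below), which is what enables RawAccess and
  // PacketAccess here.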
  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1), m_consume_dim(op.axis()),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      // dims can only be indexed with unsigned integers, so use an unsigned
      // type to let the compiler know. This silences spurious warnings such as:
      // "'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function"
      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
      for (unsigned int i = NumDims - 1; i > axis; --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
    return m_consume_dim;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }

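  // The scan is evaluated eagerly. If a destination buffer is supplied, the
  // result is written directly into it and no further evaluation is needed;
  // otherwise a temporary buffer is allocated and coeff()/packet() read from it.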
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    internal::ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
    launcher(*this, m_output);
    return true;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
  {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_STRONG_INLINE void cleanup() {
    if (m_output) {
      m_device.deallocate_temp(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

#ifdef EIGEN_USE_SYCL
  // Bind placeholder accessors to a SYCL command group handler.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
    m_output.bind(cgh);
  }
#endif
protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device EIGEN_DEVICE_REF m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  Index m_consume_dim;
  EvaluatorPointerType m_output;
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H