// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
              typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  *
  * Computes a running (cumulative) reduction of the input expression along a
  * given axis, either including or excluding the current element.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};


namespace internal {

template <typename Self>
EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  // Compute the scan along the axis, starting at the given offset
  typename Self::CoeffReturnType accum = self.accumulator().initialize();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
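      // Inclusive scan: fold coeff(curr) into the accumulator before writing,
      // so each output includes the current element (the exclusive branch
      // above writes the running total first, then accumulates).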
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
}

template <typename Self>
EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  using Scalar = typename Self::CoeffReturnType;
  using Packet = typename Self::PacketReturnType;
  // Compute the scan along the axis, starting at the calculated offset
  Packet accum = self.accumulator().template initializePacket<Packet>();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  }
}

template <typename Self, bool Vectorize, bool Parallel>
struct ReduceBlock {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};
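
// Worked example (illustrative) of the indexing used by ReduceBlock above and
// the ScanLauncher below: for a ColMajor tensor of shape (4, 3, 5) scanned
// along axis 1, stride() == 4 and size() == 3. The launcher steps idx1 over
// outer blocks in increments of stride() * size() == 12, ReduceBlock walks
// idx2 over the 4 inner positions of each block, and ReduceScalar/ReducePacket
// walk curr = idx1 + idx2 + idx3 * stride() along the scan axis itself.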

// Specialization for vectorized reduction.
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index idx2 = 0;
    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
      // Calculate the starting offset for the packet scan
      Index offset = idx1 + idx2;
      ReducePacket(self, offset, data);
    }
    for (; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};

// Single-threaded CPU implementation of scan
template <typename Self, typename Reducer, typename Device,
          bool Vectorize =
              (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
               internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a scan per
    // remaining entry. The iteration is split into two nested loops
    // (idx1 over outer blocks, idx2 over inner positions) to avoid an
    // integer division.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
      block_reducer(self, idx1, data);
    }
  }
};

#ifdef EIGEN_USE_THREADS

// Adjust block_size to avoid false sharing of cachelines among threads.
// The alignment (kBlockAlignment = 128 bytes) is twice the cache line size
// of Intel and ARM processors.
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
  const Index items_per_cacheline =
      numext::maxi<Index>(1, kBlockAlignment / item_size);
  return items_per_cacheline * divup(block_size, items_per_cacheline);
}

template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index num_scalars = self.stride();
    Index num_packets = 0;
    if (self.stride() >= PacketSize) {
      num_packets = self.stride() / PacketSize;
      self.device().parallelFor(
          num_packets,
          TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
                       16 * PacketSize * self.size(), true, PacketSize),
          // Make the shard size large enough that two neighboring threads
          // won't write to the same cacheline of `data`.
          [=](Index blk_size) {
            return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index packet = first; packet < last; ++packet) {
              const Index idx2 = packet * PacketSize;
              ReducePacket(self, idx1 + idx2, data);
            }
          });
      num_scalars -= num_packets * PacketSize;
    }
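    // Scan the lanes left over after the packet pass with scalar code
    // (all stride() lanes when stride() < PacketSize).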
    self.device().parallelFor(
        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index scalar = first; scalar < last; ++scalar) {
            const Index idx2 = num_packets * PacketSize + scalar;
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};

template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    self.device().parallelFor(
        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index idx2 = first; idx2 < last; ++idx2) {
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};

// Specialization for multi-threaded execution.
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    const Index total_size = internal::array_prod(self.dimensions());
    const Index inner_block_size = self.stride() * self.size();
    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));

    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
      launcher(self, data);
      return;
    }

    if (parallelize_by_outer_blocks) {
      // Parallelize over outer blocks.
      const Index num_outer_blocks = total_size / inner_block_size;
      self.device().parallelFor(
          num_outer_blocks,
          TensorOpCost(inner_block_size, inner_block_size,
                       16 * PacketSize * inner_block_size, Vectorize,
                       PacketSize),
          [=](Index blk_size) {
            return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index idx1 = first; idx1 < last; ++idx1) {
              ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
              block_reducer(self, idx1 * inner_block_size, data);
            }
          });
    } else {
      // Parallelize over the inner packet/scalar lanes when the scan axis is
      // not an inner dimension.
      ReduceBlock<Self, Vectorize, /*Parallel=*/true> block_reducer;
      for (Index idx1 = 0; idx1 < total_size;
           idx1 += self.stride() * self.size()) {
        block_reducer(self, idx1, data);
      }
    }
  }
};
#endif // EIGEN_USE_THREADS

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
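//
// Each GPU thread handles one scan line: `val` indexes the positions with the
// scan coordinate fixed to 0, and the offset arithmetic below mirrors the
// idx1/idx2 decomposition used by the CPU launchers; threads whose line would
// fall outside the tensor do nothing.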
template <typename Self, typename Reducer>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();
}

template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;

    LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
#endif // EIGEN_USE_GPU && (EIGEN_GPUCC)

} // namespace internal

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef const ArgType ChildTypeNoConst;
  typedef const ArgType ChildType;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1), m_consume_dim(op.axis()),
        m_output(NULL) {

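    // Illustrative example of the sizes computed here: for a ColMajor tensor
    // of shape (4, 3, 5) scanned along axis 1, m_size is 3 and m_stride ends
    // up as 4 (for RowMajor it would be 5).
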
    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      // dims can only be indexed through unsigned integers, so use an unsigned
      // type to let the compiler know. This silences spurious warnings such as:
      // "'*((void*)(& evaluator)+64)[18446744073709551615]' may be used
      // uninitialized in this function"
      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
      for (unsigned int i = NumDims - 1; i > axis; --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
    return m_consume_dim;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    internal::ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
    launcher(*this, m_output);
    return true;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
  {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_STRONG_INLINE void cleanup() {
    if (m_output) {
      m_device.deallocate_temp(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
    m_output.bind(cgh);
  }
#endif

protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device EIGEN_DEVICE_REF m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  Index m_consume_dim;
  EvaluatorPointerType m_output;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H