// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once

#include <ATen/core/Tensor.h>

#include <ATen/MemoryOverlap.h>
#include <ATen/Parallel.h>
#include <ATen/TensorIterator.h>
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <c10/util/irange.h>

#include <vector>

namespace at::native::detail {

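// Per-input metadata captured once before the copy loop: a raw pointer to the
// input's data and inner_size = sizes()[dim] * strides()[dim], i.e. the number
// of contiguous elements this input contributes per outer iteration.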
struct InputMeta {
  void* data_ptr;
  int64_t inner_size;

  InputMeta(const Tensor& t, int64_t dim, int64_t inner)
      : data_ptr(t.data_ptr()), inner_size(t.sizes()[dim] * inner) {}
};

// This kernel is used by two TensorList types:
// 1. stack_serial_kernel uses at::ArrayRef<Tensor>
// 2. Static runtime calls this kernel directly (csrc/jit/runtime/static/ops.cpp) with
//    ProcessedNodeInputWrapper.
// When making changes, make sure that they are compatible with both types!
template <typename scalar_t, typename TensorListType>
void stack_serial_kernel_impl(Tensor& result, TensorListType tensors, int64_t dim) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
      dim >= 0 && dim <= result.dim(),
      "dim out of range in stack_serial_kernel_impl");
  // For a contiguous result, sizes()[dim] * strides()[dim] is the number of
  // elements in one slice starting at `dim`, so `outer` is the product of the
  // result sizes before `dim`.
  int64_t outer =
      result.numel() / (result.sizes()[dim] * result.strides()[dim]);
  scalar_t* result_data = result.data_ptr<scalar_t>();
  int64_t ninputs = tensors.size();
  std::vector<InputMeta> inputs;
  inputs.reserve(ninputs);
  for (const auto& tensor : tensors) {
    inputs.emplace_back(tensor, dim, tensor.strides()[dim]);
  }

  using Vec = vec::Vectorized<scalar_t>;
  scalar_t* result_ptr = result_data;
  for (const auto i : c10::irange(outer)) {
    for (const auto j : c10::irange(ninputs)) {
      // Copy one contiguous inner block from input j into the output.
      int64_t local_inner = inputs[j].inner_size;
      scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner;

      if (local_inner < Vec::size()) {
        // Block is shorter than one SIMD vector; copy element by element.
        for (const auto k : c10::irange(local_inner)) {
          result_ptr[k] = input_ptr[k];
        }
      } else {
        // Vectorized identity map, i.e. a SIMD copy of the block.
        vec::map(
            [](Vec x) { return x; }, result_ptr, input_ptr, local_inner);
      }
      result_ptr += local_inner;
    }
  }
}

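// Illustrative only (not part of the original header): a minimal sketch of how
// a caller might drive stack_serial_kernel_impl once the checks below have
// passed. The wrapper name `example_serial_stack` is hypothetical; the real
// callers are the stack CPU kernel and static runtime. It assumes `result`
// has already been resized to the stacked shape and that `dim` is the new
// stack dimension. For example, stacking two contiguous float tensors of
// shape [4, 5] along dim = 1 gives a [4, 2, 5] result; outer == 4 and each
// input contributes inner_size == 5 elements per outer iteration.
inline void example_serial_stack(
    Tensor& result,
    at::ArrayRef<Tensor> tensors,
    int64_t dim) {
  // The fast path only supports Float and Double (see the checks below).
  if (result.scalar_type() == ScalarType::Float) {
    stack_serial_kernel_impl<float>(result, tensors, dim);
  } else if (result.scalar_type() == ScalarType::Double) {
    stack_serial_kernel_impl<double>(result, tensors, dim);
  }
}
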
// Checks whether the native serial stack can be invoked under these conditions:
// - result and input tensors are contiguous
// - only one thread is used
// - no type promotion has to occur
// - tensor dtype is Double or Float
template <typename TensorListType>
bool can_use_native_serial_stack_impl(Tensor& result, TensorListType tensors, int64_t dim) {
  TORCH_CHECK(tensors.size() > 0, "expected a non-empty list of Tensors");
  const Tensor& first_tensor = tensors[0];
  // stack dimension should be in range [0, firstTensor.dim())
  // dim == firstTensor.dim() is a valid input, but it is handled by the default
  // code path that uses unsqueeze
  if (dim >= first_tensor.dim()) return false;
  // Native stack doesn't apply when a tensor would be skipped
  // (stack skips empty 1-d tensors).
  if (first_tensor.numel() == 0 && first_tensor.dim() == 1) return false;
  // there should be no type promotion
  if (result.dtype() != first_tensor.dtype()) return false;

  auto first_tensor_mem_format = first_tensor.suggest_memory_format();
  ScalarType dtype = first_tensor.scalar_type();

  if (!result.is_contiguous(first_tensor_mem_format)) {
    return false;
  }

  // fast path only works for Double and Float
  if (dtype != ScalarType::Double && dtype != ScalarType::Float) {
    return false;
  }

  // check remainder of inputs
#ifndef STRIP_ERROR_MESSAGES
  auto const &first_tensor_shape = first_tensor.sizes();
#endif
  for (const auto i : c10::irange(1, tensors.size())) {
    auto const &tensor = tensors[i];
    TORCH_CHECK(tensors[i].sizes() == first_tensor.sizes(),
      "stack expects each tensor to be equal size, but got ", first_tensor_shape,
      " at entry 0 and ", tensor.sizes(), " at entry ", i);

    // every tensor must be contiguous
    // tensor sizes and strides must be the same
    // there should be no type promotion
    if (!tensor.is_contiguous(first_tensor_mem_format) ||
      tensor.strides() != first_tensor.strides() ||
      tensor.dtype() != dtype) {
      return false;
    }
  }

  // The fast native stack should only be used when it is not worth using
  // multiple threads, or when only one thread is available. Note that we aren't
  // checking result.numel() here because it may not have been resized yet and
  // we want to defer that cost till later.
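  // For example (illustrative numbers): with at::internal::GRAIN_SIZE at its
  // default of 32768 at the time of writing, stacking 8 tensors of 1000 floats
  // each (8000 elements in total) takes the serial fast path even when multiple
  // threads are available.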
  int64_t numel_in_stack = first_tensor.numel() * tensors.size();
  return numel_in_stack < at::internal::GRAIN_SIZE || at::get_num_threads() == 1;
}

template <typename TensorListType, bool should_skip_overlap_check>
struct CanUseNativeSerialStack;

template <typename TensorListType>
struct CanUseNativeSerialStack<TensorListType, false> {
  static bool call(Tensor& result, TensorListType tensors, int64_t dim) {
    // Inputs cannot alias the output tensor
    for (const auto i : c10::irange(tensors.size())) {
      auto lap = at::get_overlap_status(result, tensors[i]);
      TORCH_CHECK(lap != at::MemOverlapStatus::Partial &&
          lap != at::MemOverlapStatus::Full, 0,
          "unsupported operation: the input tensors cannot refer to any of the "
          "output memory locations. Found overlap in input tensor ", i);
    }

    return can_use_native_serial_stack_impl(result, tensors, dim);
  }
};

template <typename TensorListType>
struct CanUseNativeSerialStack<TensorListType, true> {
  static bool call(Tensor& result, TensorListType tensors, int64_t dim) {
    return can_use_native_serial_stack_impl(result, tensors, dim);
  }
};
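
// Illustrative only (not part of the original header): a minimal sketch of the
// end-to-end decision a caller could make with this header. The helper name
// `example_try_native_stack` is hypothetical, the overlap check is not skipped
// (second template argument = false, matching the at::ArrayRef<Tensor> use
// case above), and `result` is assumed to already have the stacked shape.
inline bool example_try_native_stack(
    Tensor& result,
    at::ArrayRef<Tensor> tensors,
    int64_t dim) {
  if (!CanUseNativeSerialStack<at::ArrayRef<Tensor>, false>::call(
          result, tensors, dim)) {
    return false;  // caller would fall back to the generic stack implementation
  }
  // After the checks pass, the dtype is Float or Double, so the dispatch in
  // example_serial_stack (above) covers both cases.
  example_serial_stack(result, tensors, dim);
  return true;
}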

} // namespace at::native::detail