// aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/native/quantized/cpu/QuantizedOps.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_adaptive_avg_pool2d_native.h>
#include <ATen/ops/_adaptive_avg_pool3d_native.h>
#include <ATen/ops/_empty_affine_quantized.h>
#include <ATen/ops/adaptive_avg_pool3d_native.h>
#endif

#include <c10/util/irange.h>

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

#include <ATen/native/quantized/cpu/QnnpackUtils.h>

namespace at {
namespace native {

DEFINE_DISPATCH(qadaptive_avg_pool2d_nhwc_stub);
DEFINE_DISPATCH(qadaptive_avg_pool3d_ndhwc_stub);

namespace {

inline int start_index(int out_idx, int out_len, int in_len) {
  /*
   * out_idx: the current index of the output matrix
   * out_len: the dimension size of the output matrix
   * in_len: the dimension size of the input matrix
   * Roughly, in_len / out_len gives the number of
   * elements in each averaging window.
   * This function computes the start index on the input matrix.
   */
  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
  return (int)std::floor((float)(out_idx * in_len) / out_len);
}

inline int end_index(int out_idx, int out_len, int in_len) {
  /*
   * Parameter definitions are the same as for start_index.
   * This function computes the end index on the input matrix.
   */
  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
  return (int)std::ceil((float)((out_idx + 1) * in_len) / out_len);
}

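// Worked example (illustrative note, not from the original source): with
// in_len = 5 and out_len = 3 the two helpers above produce the windows
//   out_idx 0 -> [0, 2)
//   out_idx 1 -> [1, 4)
//   out_idx 2 -> [3, 5)
// Neighbouring windows may overlap, and their union always covers the whole
// input dimension, which is what makes the pooling "adaptive".
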
// adaptive avg pool for 2D and 3D inputs
template <typename scalar_t>
static void adaptive_avg_pool_single_out_frame(
    scalar_t* input_p,
    scalar_t* output_p,
    int64_t sizeC,
    int64_t isizeD, // Set to 1 for 2D
    int64_t isizeH,
    int64_t isizeW,
    int64_t osizeD, // Set to 1 for 2D
    int64_t osizeH,
    int64_t osizeW,
    int64_t istrideC,
    int64_t istrideD, // Set to 1 for 2D
    int64_t istrideH,
    int64_t istrideW) {
  at::parallel_for(0, sizeC, 0, [&](int64_t start, int64_t end) {
    for (const auto c : c10::irange(start, end)) {
      /* loop over output */
      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
      int64_t od, oh, ow;
      for (od = 0; od < osizeD; od++) {
        int istartD = start_index(od, osizeD, isizeD);
        int iendD = end_index(od, osizeD, isizeD);
        int kD = iendD - istartD;
        // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
        float kDr = 1.0 / kD;
        for (oh = 0; oh < osizeH; oh++) {
          int istartH = start_index(oh, osizeH, isizeH);
          int iendH = end_index(oh, osizeH, isizeH);
          int kH = iendH - istartH;
          // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
          float kDHr = kDr / kH;

          for (ow = 0; ow < osizeW; ow++) {
            int istartW = start_index(ow, osizeW, isizeW);
            int iendW = end_index(ow, osizeW, isizeW);
            int kW = iendW - istartW;
            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
            float kDHWr = kDHr / kW;

            /* local pointers */
            scalar_t* ip = input_p +
                           c * istrideC +
                           istartD * istrideD +
                           istartH * istrideH +
                           istartW * istrideW;
            scalar_t* op = output_p +
                           c * osizeD * osizeH * osizeW +
                           od * osizeH * osizeW +
                           oh * osizeW +
                           ow;

            /* compute local average: */
            int64_t sum = 0;
            // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
            int id, ih, iw;
            for (id = 0; id < kD; id++) {
              for (ih = 0; ih < kH; ih++) {
                for (iw = 0; iw < kW; iw++) {
                  // NOLINTNEXTLINE(bugprone-signed-char-misuse)
                  int64_t val = (ip +
                                 id * istrideD +
                                 ih * istrideH +
                                 iw * istrideW)->val_;
                  sum += val;
                }
              }
            }

            /* set output to local average */
            // TODO: add the max/min clip
            op->val_ = static_cast<typename scalar_t::underlying>(
                std::nearbyint(sum * kDHWr));
          } // ow
        } // oh
      } // od
    }
  });
}

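// get_output_shape validates the input rank/shape and assembles the output
// sizes for an adaptive average pool with DIM spatial dimensions. For
// example, a 4-D NCHW input {N, C, H, W} with output_size {oh, ow} yields
// {N, C, oh, ow}; the batch dimension is included only when present.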
template <int64_t DIM>
std::vector<int64_t> get_output_shape(
    const Tensor& input,
    IntArrayRef output_size) {
  for (const auto i : c10::irange(1, input.dim())) {
    // Allow for empty batch.
    TORCH_CHECK(
        input.size(i) > 0,
        "adaptive_avg_pooling", DIM, "d(): ",
        "expected input to have non-empty spatial "
        "dimensions, but input has sizes ",
        input.sizes(),
        " with dimension ",
        i,
        " being empty");
  }

  TORCH_CHECK(
      (input.dim() == DIM + 1 || input.dim() == DIM + 2),
      "non-empty ",
      DIM + 1,
      "D or ",
      DIM + 2,
      "D (batch mode) tensor expected for input");

  /* Channels */
  const int64_t sizeC = input.size(-(DIM + 1));

  std::vector<int64_t> output_shape;
  output_shape.reserve(input.dim());
  if (input.dim() == DIM + 2) {
    // Include Batch
    output_shape.push_back(input.size(0));
  }
  output_shape.push_back(sizeC);
  for (const auto size : output_size) {
    output_shape.push_back(size);
  }
  return output_shape;
}

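// _adaptive_avg_pool is the shared 2D/3D driver. Channels-last (NHWC /
// NDHWC) inputs take a fast path through qadaptive_avg_pool3d_ndhwc_stub
// (with the depth dimension treated as 1 in the 2D case); all other layouts
// fall back to the generic contiguous kernel above, with batch and channels
// folded into a single dimension.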
template <int32_t kSpatialDim, typename scalar_t>
Tensor _adaptive_avg_pool(const Tensor& input,
                          IntArrayRef output_size,
                          Tensor& output) {
  const auto output_shape = get_output_shape<kSpatialDim>(input, output_size);
  /* sizes */
  int64_t sizeC = input.size(-(kSpatialDim + 1));
  int64_t isizeD = kSpatialDim == 2 ? 1 : input.size(-3);
  int64_t isizeH = input.size(-2);
  int64_t isizeW = input.size(-1);

  auto osizeD = kSpatialDim == 2 ? 1 : output_shape[output_shape.size() - 3];
  auto osizeH = output_shape[output_shape.size() - 2];
  auto osizeW = output_shape[output_shape.size() - 1];

  int64_t sizeB = output_shape.size() == (kSpatialDim + 1) ? 1 : output_shape[0];
  if (input.is_contiguous(c10::MemoryFormat::ChannelsLast) ||
      input.is_contiguous(c10::MemoryFormat::ChannelsLast3d)) {
    // Fast path for channels-last (NHWC / NDHWC) input
    auto in_stride = input.strides();
    output = at::_empty_affine_quantized(
        output_shape,
        input.options().memory_format(input.suggest_memory_format()),
        input.q_scale(),
        input.q_zero_point(),
        std::nullopt);

    qadaptive_avg_pool3d_ndhwc_stub(
        input.device().type(),
        input,
        output,
        sizeB,
        sizeC,
        isizeD,
        isizeH,
        isizeW,
        osizeD,
        osizeH,
        osizeW,
        in_stride[0],
        in_stride[in_stride.size() - (kSpatialDim + 1)],
        in_stride[in_stride.size() - kSpatialDim],
        in_stride[in_stride.size() - 2],
        in_stride[in_stride.size() - 1]);
    return output;
  } else {
    output = at::_empty_affine_quantized(
        output_shape, input.options(), input.q_scale(), input.q_zero_point());
    auto input_contig = input.contiguous();
    auto input_data = input_contig.data_ptr<scalar_t>();
    auto output_data = output.data_ptr<scalar_t>();
    auto in_stride = input_contig.strides();

    adaptive_avg_pool_single_out_frame<scalar_t>(
        input_data,
        output_data,
        // Contract batch and channels into one dimension
        sizeB * sizeC,
        isizeD,
        isizeH,
        isizeW,
        osizeD,
        osizeH,
        osizeW,
        in_stride[in_stride.size() - (kSpatialDim + 1)],
        in_stride[in_stride.size() - kSpatialDim],
        in_stride[in_stride.size() - 2],
        in_stride[in_stride.size() - 1]);
    return output;
  }
}

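// Thin dtype-templated wrappers used by the dispatch macros below.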
template <typename scalar_t>
Tensor q_adaptive_avg_pool2d(const Tensor& input, IntArrayRef output_size) {
  Tensor output;
  return _adaptive_avg_pool<2, scalar_t>(input, output_size, output);
}

template <typename scalar_t>
Tensor q_adaptive_avg_pool3d(Tensor& output, const Tensor& input,
                             IntArrayRef output_size) {
  return _adaptive_avg_pool<3, scalar_t>(input, output_size, output);
}

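// QNNPACK path: when both spatial sizes divide evenly, an adaptive average
// pool is equivalent to a plain average pool with kernel == stride ==
// input_size / output_size. For example (illustrative numbers), a 224x224
// input pooled to 7x7 becomes a 32x32 average pool with stride 32.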
#ifdef USE_PYTORCH_QNNPACK
Tensor qnnpack_adaptive_avg_pool2d(
    const at::Tensor& input,
    IntArrayRef output_size) {
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  std::array<int64_t, 2> kernel_size;
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  std::array<int64_t, 2> stride;
  std::array<int64_t, 2> padding{0, 0};
  bool ceil_mode{false};
  bool count_include_pad{false};

  const auto output_shape = get_output_shape<2>(input, output_size);
  auto output_height = output_shape[output_shape.size() - 2];
  auto output_width = output_shape[output_shape.size() - 1];
  auto input_height = input.sizes()[input.dim() - 2];
  auto input_width = input.sizes()[input.dim() - 1];
  stride[0] = input_height / output_height;
  stride[1] = input_width / output_width;
  // Given the constraint that input_height/width % output_height/width == 0,
  // the stride and kernel size are the same.
  kernel_size[0] = stride[0];
  kernel_size[1] = stride[1];

  return at::native::qnnp_avgpool_helper::qnnpack_avg_pool2d(
      input,
      kernel_size,
      stride,
      padding,
      ceil_mode,
      count_include_pad,
      std::nullopt);
}

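// QNNPACK is only eligible when the pooling is not an identity (output equal
// to input) and both spatial dimensions divide evenly, so the kernel/stride
// reduction above is exact.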
bool enable_qnnpack_for_ada_avgpool(
    const at::Tensor& input,
    IntArrayRef output_size) {
  const auto output_shape = get_output_shape<2>(input, output_size);
  auto output_height = output_shape[output_shape.size() - 2];
  auto output_width = output_shape[output_shape.size() - 1];
  auto input_height = input.sizes()[input.dim() - 2];
  auto input_width = input.sizes()[input.dim() - 1];

  return !(input_width == output_width && input_height == output_height) &&
      (input_height % output_height == 0) && (input_width % output_width == 0);
}
#endif
} // namespace

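// Public 2D entry point: prefer the QNNPACK kernel when the QNNPACK engine
// is active, the input is quint8, and the shape check passes; otherwise
// dispatch over the quantized dtypes to the generic kernel.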
Tensor adaptive_avg_pool2d_quantized_cpu(
    const at::Tensor& input,
    IntArrayRef output_size) {
#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK &&
      input.scalar_type() == kQUInt8 &&
      enable_qnnpack_for_ada_avgpool(input, output_size)) {
    return qnnpack_adaptive_avg_pool2d(input, output_size);
  }
#endif
  Tensor output;
  AT_DISPATCH_QINT_TYPES(
      input.scalar_type(), "adaptive_avg_pool2d_quantized_cpu", [&]() {
        output = q_adaptive_avg_pool2d<scalar_t>(input, output_size);
      });
  return output;
}

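// 3D out-variant: there is no QNNPACK kernel for 3D adaptive average
// pooling, so the QNNPACK engine only emits a warning before falling back to
// the generic implementation.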
Tensor& adaptive_avg_pool3d_out_quantized_cpu(
    const at::Tensor& input,
    IntArrayRef output_size,
    at::Tensor& output) {
#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {
    TORCH_WARN("Quantized Adaptive Average Pool 3D is not implemented for ",
               "QNNPACK. Falling back to default implementation.");
  }
#endif
  AT_DISPATCH_QINT_TYPES(
      input.scalar_type(), "adaptive_avg_pool3d_quantized_cpu", [&]() {
        output = q_adaptive_avg_pool3d<scalar_t>(output, input, output_size);
      });
  return output;
}

Tensor adaptive_avg_pool3d_quantized_cpu(
    const at::Tensor& input,
    IntArrayRef output_size) {
  Tensor output;
  return at::native::adaptive_avg_pool3d_out_quantized_cpu(input, output_size, output);
}

} // namespace native
} // namespace at
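
// Illustrative usage sketch (not part of the original file; only to show how
// these kernels are typically reached through the dispatcher):
//
//   at::Tensor x  = at::rand({1, 3, 224, 224});
//   at::Tensor qx = at::quantize_per_tensor(
//       x, /*scale=*/0.1, /*zero_point=*/0, at::kQUInt8);
//   // Routes to adaptive_avg_pool2d_quantized_cpu for QuantizedCPU tensors.
//   at::Tensor qy = at::adaptive_avg_pool2d(qx, {7, 7});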