#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/native/quantized/cpu/QuantizedOps.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_adaptive_avg_pool2d_native.h>
#include <ATen/ops/_adaptive_avg_pool3d_native.h>
#include <ATen/ops/_empty_affine_quantized.h>
#include <ATen/ops/adaptive_avg_pool3d_native.h>
#endif

#include <c10/util/irange.h>

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

#include <ATen/native/quantized/cpu/QnnpackUtils.h>

namespace at {
namespace native {

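// Dispatch stubs for the channels-last (NHWC / NDHWC) fast-path kernels.
// The actual kernel implementations are registered elsewhere via
// REGISTER_DISPATCH.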
DEFINE_DISPATCH(qadaptive_avg_pool2d_nhwc_stub);
DEFINE_DISPATCH(qadaptive_avg_pool3d_ndhwc_stub);

namespace {

inline int start_index(int out_idx, int out_len, int in_len) {
  /*
   * out_idx: the current index of output matrix
   * out_len: the dimension_size of output matrix
   * in_len: the dimension_size of input matrix
   * Basically, in_len / out_len gives the number of
   * elements in each average computation.
   * This function computes the start index on input matrix.
   */
  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
  return (int)std::floor((float)(out_idx * in_len) / out_len);
}

inline int end_index(int out_idx, int out_len, int in_len) {
  /*
   * Parameter definition is the same as start_index.
   * This function computes the end index on input matrix.
   */
  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
  return (int)std::ceil((float)((out_idx + 1) * in_len) / out_len);
}
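// For example, with in_len = 5 and out_len = 3 the pooling windows are
// [start, end) = [0, 2), [1, 4) and [3, 5); adjacent windows may overlap by
// one element when in_len is not divisible by out_len.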

// adaptive avg pool for 2D and 3D inputs
template <typename scalar_t>
static void adaptive_avg_pool_single_out_frame(
    scalar_t* input_p,
    scalar_t* output_p,
    int64_t sizeC,
    int64_t isizeD, // Set to 1 for 2D
    int64_t isizeH,
    int64_t isizeW,
    int64_t osizeD, // Set to 1 for 2D
    int64_t osizeH,
    int64_t osizeW,
    int64_t istrideC,
    int64_t istrideD, // Set to 1 for 2D
    int64_t istrideH,
    int64_t istrideW) {
  at::parallel_for(0, sizeC, 0, [&](int64_t start, int64_t end) {
    for (const auto c : c10::irange(start, end)) {
      /* loop over output */
      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
      int64_t od, oh, ow;
      for (od = 0; od < osizeD; od++) {
        int istartD = start_index(od, osizeD, isizeD);
        int iendD = end_index(od, osizeD, isizeD);
        int kD = iendD - istartD;
        // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
        float kDr = 1.0 / kD;
        for (oh = 0; oh < osizeH; oh++) {
          int istartH = start_index(oh, osizeH, isizeH);
          int iendH = end_index(oh, osizeH, isizeH);
          int kH = iendH - istartH;
          // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
          float kDHr = kDr / kH;

          for (ow = 0; ow < osizeW; ow++) {
            int istartW = start_index(ow, osizeW, isizeW);
            int iendW = end_index(ow, osizeW, isizeW);
            int kW = iendW - istartW;
            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
            float kDHWr = kDHr / kW;

            /* local pointers */
            scalar_t* ip = input_p +
                           c * istrideC +
                           istartD * istrideD +
                           istartH * istrideH +
                           istartW * istrideW;
            scalar_t* op = output_p +
                           c * osizeD * osizeH * osizeW +
                           od * osizeH * osizeW +
                           oh * osizeW +
                           ow;

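            // The output tensor reuses the input's scale and zero_point, so
            // the average can be taken directly over the raw quantized integer
            // values and rounded back to the underlying integer type.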
            /* compute local average: */
            int64_t sum = 0;
            // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
            int id, ih, iw;
            for (id = 0; id < kD; id++) {
              for (ih = 0; ih < kH; ih++) {
                for (iw = 0; iw < kW; iw++) {
                  // NOLINTNEXTLINE(bugprone-signed-char-misuse)
                  int64_t val = (ip +
                                 id * istrideD +
                                 ih * istrideH +
                                 iw * istrideW)->val_;
                  sum += val;
                }
              }
            }

            /* set output to local average */
            // TODO: add the max/min clip
            op->val_ = static_cast<typename scalar_t::underlying>(
                std::nearbyint(sum * kDHWr));
          } // ow
        } // oh
      } // od
    }
  });
}

template <int64_t DIM>
std::vector<int64_t> get_output_shape(
    const Tensor& input,
    IntArrayRef output_size) {
  for (const auto i : c10::irange(1, input.dim())) {
    // Allow for empty batch.
    TORCH_CHECK(
        input.size(i) > 0,
        "adaptive_avg_pooling", DIM, "d(): ",
        "expected input to have non-empty spatial "
        "dimensions, but input has sizes ",
        input.sizes(),
        " with dimension ",
        i,
        " being empty");
  }

  TORCH_CHECK(
      (input.dim() == DIM + 1 || input.dim() == DIM + 2),
      "non-empty ",
      DIM + 1,
      "D or ",
      DIM + 2,
      "D (batch mode) tensor expected for input");

  /* Channels */
  const int64_t sizeC = input.size(-(DIM + 1));

  std::vector<int64_t> output_shape;
  output_shape.reserve(input.dim());
  if (input.dim() == DIM + 2) {
    // Include Batch
    output_shape.push_back(input.size(0));
  }
  output_shape.push_back(sizeC);
  for (const auto size : output_size) {
    output_shape.push_back(size);
  }
  return output_shape;
}
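// For example, an (N, C, H, W) input with output_size = {oH, oW} yields the
// shape {N, C, oH, oW}; the unbatched (C, H, W) case yields {C, oH, oW}.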

template <int32_t kSpatialDim, typename scalar_t>
Tensor _adaptive_avg_pool(const Tensor& input,
                          IntArrayRef output_size,
                          Tensor& output) {
  const auto output_shape = get_output_shape<kSpatialDim>(input, output_size);
  /* sizes */
  int64_t sizeC = input.size(-(kSpatialDim + 1));
  int64_t isizeD = kSpatialDim == 2 ? 1 : input.size(-3);
  int64_t isizeH = input.size(-2);
  int64_t isizeW = input.size(-1);

  auto osizeD = kSpatialDim == 2 ? 1 : output_shape[output_shape.size() - 3];
  auto osizeH = output_shape[output_shape.size() - 2];
  auto osizeW = output_shape[output_shape.size() - 1];

  int64_t sizeB = output_shape.size() == (kSpatialDim + 1) ? 1 : output_shape[0];
  if (input.is_contiguous(c10::MemoryFormat::ChannelsLast) ||
      input.is_contiguous(c10::MemoryFormat::ChannelsLast3d)) {
    // Fast path for NDHWC
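    // Both the 2D and 3D cases go through the NDHWC kernel here; for
    // kSpatialDim == 2 the depth extent (isizeD / osizeD) is simply 1.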
    auto in_stride = input.strides();
    output = at::_empty_affine_quantized(
        output_shape,
        input.options().memory_format(input.suggest_memory_format()),
        input.q_scale(),
        input.q_zero_point(),
        std::nullopt);

    qadaptive_avg_pool3d_ndhwc_stub(
        input.device().type(),
        input,
        output,
        sizeB,
        sizeC,
        isizeD,
        isizeH,
        isizeW,
        osizeD,
        osizeH,
        osizeW,
        in_stride[0],
        in_stride[in_stride.size() - (kSpatialDim + 1)],
        in_stride[in_stride.size() - kSpatialDim],
        in_stride[in_stride.size() - 2],
        in_stride[in_stride.size() - 1]);
    return output;
  } else {
    output = at::_empty_affine_quantized(
        output_shape, input.options(), input.q_scale(), input.q_zero_point());
    auto input_contig = input.contiguous();
    auto input_data = input_contig.data_ptr<scalar_t>();
    auto output_data = output.data_ptr<scalar_t>();
    auto in_stride = input_contig.strides();

    adaptive_avg_pool_single_out_frame<scalar_t>(
        input_data,
        output_data,
        // Contract batch and channels into one dimension
        sizeB * sizeC,
        isizeD,
        isizeH,
        isizeW,
        osizeD,
        osizeH,
        osizeW,
        in_stride[in_stride.size() - (kSpatialDim + 1)],
        in_stride[in_stride.size() - kSpatialDim],
        in_stride[in_stride.size() - 2],
        in_stride[in_stride.size() - 1]);
    return output;
  }
}

template <typename scalar_t>
Tensor q_adaptive_avg_pool2d(const Tensor& input, IntArrayRef output_size) {
  Tensor output;
  return _adaptive_avg_pool<2, scalar_t>(input, output_size, output);
}

template <typename scalar_t>
Tensor q_adaptive_avg_pool3d(Tensor& output, const Tensor& input,
                             IntArrayRef output_size) {
  return _adaptive_avg_pool<3, scalar_t>(input, output_size, output);
}

#ifdef USE_PYTORCH_QNNPACK
Tensor qnnpack_adaptive_avg_pool2d(
    const at::Tensor& input,
    IntArrayRef output_size) {
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  std::array<int64_t, 2> kernel_size;
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  std::array<int64_t, 2> stride;
  std::array<int64_t, 2> padding{0, 0};
  bool ceil_mode{false};
  bool count_include_pad{false};

  const auto output_shape = get_output_shape<2>(input, output_size);
  auto output_height = output_shape[output_shape.size() - 2];
  auto output_width = output_shape[output_shape.size() - 1];
  auto input_height = input.sizes()[input.dim() - 2];
  auto input_width = input.sizes()[input.dim() - 1];
  stride[0] = input_height / output_height;
  stride[1] = input_width / output_width;
  // Given the constraint that input_height/width % output_height/width == 0,
  // stride and kernel size are the same.
  kernel_size[0] = stride[0];
  kernel_size[1] = stride[1];

  return at::native::qnnp_avgpool_helper::qnnpack_avg_pool2d(
      input,
      kernel_size,
      stride,
      padding,
      ceil_mode,
      count_include_pad,
      std::nullopt);
}

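// The QNNPACK kernel is only usable when adaptive pooling degenerates into a
// regular average pool, i.e. the input spatial dims are evenly divisible by
// the output dims, and the op is not a pure copy (output equal to input size).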
bool enable_qnnpack_for_ada_avgpool(
    const at::Tensor& input,
    IntArrayRef output_size) {
  const auto output_shape = get_output_shape<2>(input, output_size);
  auto output_height = output_shape[output_shape.size() - 2];
  auto output_width = output_shape[output_shape.size() - 1];
  auto input_height = input.sizes()[input.dim() - 2];
  auto input_width = input.sizes()[input.dim() - 1];

  return !(input_width == output_width && input_height == output_height) &&
      (input_height % output_height == 0) && (input_width % output_width == 0);
}
#endif
} // namespace

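// Illustrative ATen usage (values are arbitrary); adaptive average pooling on
// a per-tensor quantized CPU tensor is expected to reach the entry points
// below:
//
//   auto x  = at::rand({1, 3, 8, 8});
//   auto qx = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/0, at::kQUInt8);
//   auto qy = at::adaptive_avg_pool2d(qx, {2, 2});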
Tensor adaptive_avg_pool2d_quantized_cpu(
    const at::Tensor& input,
    IntArrayRef output_size) {
#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK &&
      input.scalar_type() == kQUInt8 &&
      enable_qnnpack_for_ada_avgpool(input, output_size)) {
    return qnnpack_adaptive_avg_pool2d(input, output_size);
  }
#endif
  Tensor output;
  AT_DISPATCH_QINT_TYPES(
      input.scalar_type(), "adaptive_avg_pool2d_quantized_cpu", [&]() {
        output = q_adaptive_avg_pool2d<scalar_t>(input, output_size);
      });
  return output;
}

Tensor& adaptive_avg_pool3d_out_quantized_cpu(
    const at::Tensor& input,
    IntArrayRef output_size,
    at::Tensor& output) {
#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {
    TORCH_WARN("Quantized Adaptive Average Pool 3D is not implemented for ",
               "QNNPACK. Falling back to default implementation.");
  }
#endif
  AT_DISPATCH_QINT_TYPES(
      input.scalar_type(), "adaptive_avg_pool3d_quantized_cpu", [&]() {
        output = q_adaptive_avg_pool3d<scalar_t>(output, input, output_size);
      });
  return output;
}

Tensor adaptive_avg_pool3d_quantized_cpu(
    const at::Tensor& input,
    IntArrayRef output_size) {
  Tensor output;
  return at::native::adaptive_avg_pool3d_out_quantized_cpu(input, output_size, output);
}

} // namespace native
} // namespace at