#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/QuantizedOps.h>
#include <ATen/native/quantized/cpu/QnnpackUtils.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>

#include <algorithm>                                  // for std::sort
#include <limits>                                     // for std::numeric_limits
#include <numeric>                                    // for std::iota

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_empty_affine_quantized.h>         // for _empty_affine_q...
#include <ATen/ops/mean.h>                            // for mean
#include <ATen/ops/mean_native.h>                     // for mean_out_quanti...
#include <ATen/ops/quantize_per_tensor.h>             // for quantize_per_te...
#include <ATen/ops/std.h>
#include <ATen/ops/std_native.h>
#include <ATen/ops/zeros_like_ops.h>
#endif

namespace at {
namespace native {

DEFINE_DISPATCH(qmean_inner_dim_stub);
DEFINE_DISPATCH(qstd_inner_dim_stub);

// If mean/std is taken in the innermost dims, the fast path can be used.
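// The reduction qualifies when the (wrapped) dims, sorted in descending
// order, form a contiguous run ending at the last dimension, e.g. dims
// (2, 3) of a 4-D tensor.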
inline bool is_innnermost_dim(
    const Tensor& self,
    OptionalIntArrayRef opt_dim) {
  if (!opt_dim.has_value()) {
    return true;
  }
  auto dims = opt_dim.value().vec();
  auto ndim = self.dim();
  maybe_wrap_dims(dims, ndim);
  std::sort(dims.begin(), dims.end(), std::greater<int64_t>());
  bool is_innermost = dims.empty() || dims[0] == ndim - 1;
  for (size_t i = 1; i < dims.size(); ++i) {
    is_innermost = is_innermost && (dims[i] == dims[i-1] - 1);
  }
  return is_innermost;
}

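// The mean fast path additionally requires that no dtype conversion is
// requested, i.e. the output dtype (if given) matches the input's quantized dtype.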
inline bool is_mean_inner_dim_fast_path(
    const Tensor& self,
    OptionalIntArrayRef opt_dim,
    std::optional<ScalarType> opt_dtype) {
  bool is_fast_path =
      is_innnermost_dim(self, opt_dim) &&
      (!opt_dtype.has_value() || opt_dtype.value() == self.scalar_type());
  return is_fast_path;
}

#ifdef USE_PYTORCH_QNNPACK
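// Lower mean over dims (2, 3) of a 4-D NCHW quantized tensor to QNNPACK's
// global average pooling operator. QNNPACK expects NWC layout, so the input is
// made channels-last contiguous and H * W is passed as the pooling "width".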
static Tensor qnnpack_mean(const Tensor& input, IntArrayRef dim, bool keepdim) {
  Tensor output;
  TORCH_CHECK(
      input.ndimension() == 4,
      "qnnpack_global_average_pool: Expected input to be 4-dimensional: got ",
      input.ndimension());
  TORCH_CHECK(
      dim.size() == 2,
      "qnnpack_global_average_pool: dim size must be a tuple of two ints");
  TORCH_CHECK(
      dim[0] == 2 && dim[1] == 3,
      "qnnpack_global_average_pool: Reduction dimensions must match last 2 dimensions of input tensor");

  const int64_t batch_size = input.size(0);
  const int64_t inC = input.size(1);
  const int64_t inH = input.size(2);
  const int64_t inW = input.size(3);

  Tensor input_contig = input.contiguous(MemoryFormat::ChannelsLast);

  initQNNPACK();
  const auto scale = input_contig.q_scale();
  const auto zero_point = input_contig.q_zero_point();
  const auto outC = inC;

  output = at::_empty_affine_quantized(
      keepdim ? IntArrayRef{batch_size, outC, 1, 1}
              : IntArrayRef{batch_size, outC},
      at::device(kCPU).dtype(kQUInt8),
      scale,
      zero_point);

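  // Create, set up, and run the QNNPACK op. Input and output share the same
  // quantization parameters (scale and zero_point), matching the allocation above.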
  pytorch_qnnp_operator_t qnnpack_operator{nullptr};
  const pytorch_qnnp_status createStatus =
      pytorch_qnnp_create_global_average_pooling_nwc_q8(
          inC,
          zero_point,
          scale,
          zero_point,
          scale,
          std::numeric_limits<uint8_t>::min() /* output min */,
          std::numeric_limits<uint8_t>::max() /* output max */,
          0,
          &qnnpack_operator);

  CAFFE_ENFORCE(
      createStatus == pytorch_qnnp_status_success,
      "failed to create QNNPACK Global Average Pooling operator");
  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>
      qnnpack_uniq_ptr(qnnpack_operator);

  const pytorch_qnnp_status setupStatus =
      pytorch_qnnp_setup_global_average_pooling_nwc_q8(
          qnnpack_operator,
          batch_size,
          inH * inW,
          (uint8_t*)input_contig.data_ptr<c10::quint8>() /* input data */,
          inC,
          (uint8_t*)output.data_ptr<c10::quint8>() /* output data */,
          outC);
  CAFFE_ENFORCE(
      setupStatus == pytorch_qnnp_status_success,
      "failed to setup QNNPACK Global Average Pooling operator");
  pthreadpool_t threadpool = caffe2::pthreadpool_();
  const pytorch_qnnp_status runStatus =
      pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
  TORCH_INTERNAL_ASSERT(
      runStatus == pytorch_qnnp_status_success,
      "failed to run QNNPACK Global Average Pool operator");
  return output;
}
#endif

Tensor& mean_out_quantized_cpu(
    const Tensor& self,
    OptionalIntArrayRef opt_dim,
    bool keepdim,
    std::optional<ScalarType> opt_dtype,
    Tensor& result) {
#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK &&
      self.scalar_type() == kQUInt8 && opt_dim.has_value()) {
    auto dim = opt_dim.value();
    // QNNPACK is currently only supported for NCHW + dim=(2, 3).
    // Remove these checks after a generic version is implemented.
    if (self.ndimension() == 4 && dim.size() == 2 && dim[0] == 2 && dim[1] == 3) {
      result = qnnpack_mean(self, dim, keepdim);
      return result;
    }
  }
#endif

  // Take average in the innermost dimensions
  if (self.is_contiguous(c10::MemoryFormat::Contiguous) &&
      is_mean_inner_dim_fast_path(self, opt_dim, opt_dtype)) {
    qmean_inner_dim_stub(self.device().type(), self, opt_dim, keepdim, opt_dtype, result);
    return result;
  }
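  // Reference path: dequantize, take the mean in floating point, then
  // requantize with the input's scale and zero point.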
  auto self_dequantized = self.dequantize();
  auto result_dequantized = at::mean(self_dequantized, opt_dim, keepdim, opt_dtype);
  result = at::quantize_per_tensor(
      result_dequantized,
      self.q_scale(),
      self.q_zero_point(),
      opt_dtype.value_or(self.scalar_type()));
  return result;
}

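// Out-of-place overload; the result tensor is produced by the out variant above.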
Tensor mean_quantized_cpu(
    const Tensor& self,
    OptionalIntArrayRef opt_dim,
    bool keepdim,
    std::optional<ScalarType> dtype) {
  Tensor result;
  mean_out_quantized_cpu(self, opt_dim, keepdim, dtype, result);
  return result;
}

// qstd
inline bool is_std_inner_dim_fast_path(
    const Tensor& self,
    OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction) {
  // Do not enter fast path if there are too few elements
  IntArrayRef dims = dim.has_value() ? dim.value() : IntArrayRef();
  auto all_dims = std::vector<int64_t>(self.dim());
  std::iota(all_dims.begin(), all_dims.end(), 0);
  dims = dims.empty() ? all_dims : dims;
  bool has_correction = !correction.value_or(1).equal(0);
  int64_t num_ele = 1;
  for (auto d : dims) {
    num_ele *= self.size(d);
  }
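  // A single-element reduction with a nonzero correction would divide by
  // (N - correction) <= 0, so fall back to the reference path in that case.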
  if (num_ele == 1 && has_correction) {
    return false;
  }
  return is_innnermost_dim(self, dims);
}

Tensor& std_out_quantized_cpu(
    const Tensor& self,
    OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction,
    bool keepdim,
    Tensor& result) {
  // Fast path
  if (self.is_contiguous(c10::MemoryFormat::Contiguous) &&
      is_std_inner_dim_fast_path(self, dim, correction)) {
    qstd_inner_dim_stub(self.device().type(), self, dim, correction, keepdim, result);
    return result;
  }

  // Reference path
  auto self_dequantized = self.dequantize();
  auto result_dequantized = at::std(self_dequantized, dim, correction, keepdim);
  result = at::quantize_per_tensor(
      result_dequantized,
      self.q_scale(),
      self.q_zero_point(),
      self.scalar_type());
  return result;
}

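// Out-of-place overload of std for quantized CPU tensors; delegates to the
// out variant above.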
Tensor std_quantized_cpu(
    const Tensor& self,
    OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction,
    bool keepdim) {
  Tensor result;
  std_out_quantized_cpu(self, dim, correction, keepdim, result);
  return result;
}

} // namespace native
} // namespace at