#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/QuantizedOps.h>
#include <ATen/native/quantized/cpu/QnnpackUtils.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>

#include <algorithm>
#include <functional>
#include <limits>
#include <numeric>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_empty_affine_quantized.h> // for _empty_affine_q...
#include <ATen/ops/mean.h> // for mean
#include <ATen/ops/mean_native.h> // for mean_out_quanti...
#include <ATen/ops/quantize_per_tensor.h> // for quantize_per_te...
#include <ATen/ops/std.h>
#include <ATen/ops/std_native.h>
#include <ATen/ops/zeros_like_ops.h>
#endif

namespace at {
namespace native {

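// Vectorized kernels for these stubs are registered via REGISTER_DISPATCH in
// the quantized CPU kernel files.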
DEFINE_DISPATCH(qmean_inner_dim_stub);
DEFINE_DISPATCH(qstd_inner_dim_stub);

// If mean/std is taken in the innermost dims, the fast path can be used.
inline bool is_innnermost_dim(
    const Tensor& self,
    OptionalIntArrayRef opt_dim) {
  if (!opt_dim.has_value()) {
    return true;
  }
  auto dims = opt_dim.value().vec();
  auto ndim = self.dim();
  maybe_wrap_dims(dims, ndim);
  std::sort(dims.begin(), dims.end(), std::greater<int64_t>());
  bool is_innermost = dims.empty() || dims[0] == ndim - 1;
  for (size_t i = 1; i < dims.size(); ++i) {
    is_innermost = is_innermost && (dims[i] == dims[i-1] - 1);
  }
  return is_innermost;
}

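// The mean fast path additionally requires that the requested output dtype,
// if given, matches the input's quantized dtype, so the reduction stays in
// the quantized domain.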
inline bool is_mean_inner_dim_fast_path(
    const Tensor& self,
    OptionalIntArrayRef opt_dim,
    std::optional<ScalarType> opt_dtype) {
  bool is_fast_path =
      is_innnermost_dim(self, opt_dim) &&
      (!opt_dtype.has_value() || opt_dtype.value() == self.scalar_type());
  return is_fast_path;
}

#ifdef USE_PYTORCH_QNNPACK
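// Computes the mean over dims (2, 3) of a 4-D NCHW quint8 tensor by running
// QNNPACK's global average pooling operator on the ChannelsLast data, treating
// H * W as the pooling width and C as the channel dimension.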
static Tensor qnnpack_mean(const Tensor& input, IntArrayRef dim, bool keepdim) {
  Tensor output;
  TORCH_CHECK(
      input.ndimension() == 4,
      "qnnpack_global_average_pool: Expected input to be 4-dimensional: got ",
      input.ndimension());
  TORCH_CHECK(
      dim.size() == 2,
      "qnnpack_global_average_pool: dim size must be a tuple of two ints");
  TORCH_CHECK(
      dim[0] == 2 && dim[1] == 3,
      "qnnpack_global_average_pool: Reduction dimensions must match last 2 dimensions of input tensor");

  const int64_t batch_size = input.size(0);
  const int64_t inC = input.size(1);
  const int64_t inH = input.size(2);
  const int64_t inW = input.size(3);

  Tensor input_contig = input.contiguous(MemoryFormat::ChannelsLast);

  initQNNPACK();
  const auto scale = input_contig.q_scale();
  const auto zero_point = input_contig.q_zero_point();
  const auto outC = inC;

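  // The output reuses the input's quantization parameters; its shape is
  // (N, C, 1, 1) when keepdim is true and (N, C) otherwise.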
  output = at::_empty_affine_quantized(
      keepdim ? IntArrayRef{batch_size, outC, 1, 1}
              : IntArrayRef{batch_size, outC},
      at::device(kCPU).dtype(kQUInt8),
      scale,
      zero_point);

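  // Create the QNNPACK operator with identical input and output scale /
  // zero_point, so the pooling is a plain average in the quantized domain,
  // clamped to the full uint8 range.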
  pytorch_qnnp_operator_t qnnpack_operator{nullptr};
  const pytorch_qnnp_status createStatus =
      pytorch_qnnp_create_global_average_pooling_nwc_q8(
          inC,
          zero_point,
          scale,
          zero_point,
          scale,
          std::numeric_limits<uint8_t>::min() /* output min */,
          std::numeric_limits<uint8_t>::max() /* output max */,
          0,
          &qnnpack_operator);

  CAFFE_ENFORCE(
      createStatus == pytorch_qnnp_status_success,
      "failed to create QNNPACK Global Average Pooling operator");
  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>
      qnnpack_uniq_ptr(qnnpack_operator);

  const pytorch_qnnp_status setupStatus =
      pytorch_qnnp_setup_global_average_pooling_nwc_q8(
          qnnpack_operator,
          batch_size,
          inH * inW,
          (uint8_t*)input_contig.data_ptr<c10::quint8>() /* input data */,
          inC,
          (uint8_t*)output.data_ptr<c10::quint8>() /* output data */,
          outC);
  CAFFE_ENFORCE(
      setupStatus == pytorch_qnnp_status_success,
      "failed to setup QNNPACK Global Average Pooling operator");
  pthreadpool_t threadpool = caffe2::pthreadpool_();
  const pytorch_qnnp_status runStatus =
      pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
  TORCH_INTERNAL_ASSERT(
      runStatus == pytorch_qnnp_status_success,
      "failed to run QNNPACK Global Average Pool operator");
  return output;
}
#endif

Tensor& mean_out_quantized_cpu(
    const Tensor& self,
    OptionalIntArrayRef opt_dim,
    bool keepdim,
    std::optional<ScalarType> opt_dtype,
    Tensor& result) {
#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK &&
      self.scalar_type() == kQUInt8 && opt_dim.has_value()) {
    auto dim = opt_dim.value();
    // QNNPACK is currently only supported for NCHW input with dim = (2, 3).
    // Remove these checks after a generic version is implemented.
    if (self.ndimension() == 4 && dim.size() == 2 && dim[0] == 2 && dim[1] == 3) {
      result = qnnpack_mean(self, dim, keepdim);
      return result;
    }
  }
#endif

  // Fast path: take the average in the innermost dimensions.
  if (self.is_contiguous(c10::MemoryFormat::Contiguous) &&
      is_mean_inner_dim_fast_path(self, opt_dim, opt_dtype)) {
    qmean_inner_dim_stub(self.device().type(), self, opt_dim, keepdim, opt_dtype, result);
    return result;
  }
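  // Reference path: dequantize, compute the mean in float, then re-quantize
  // with the input's scale and zero_point.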
  auto self_dequantized = self.dequantize();
  auto result_dequantized = at::mean(self_dequantized, opt_dim, keepdim, opt_dtype);
  result = at::quantize_per_tensor(
      result_dequantized,
      self.q_scale(),
      self.q_zero_point(),
      opt_dtype.value_or(self.scalar_type()));
  return result;
}

Tensor mean_quantized_cpu(
    const Tensor& self,
    OptionalIntArrayRef opt_dim,
    bool keepdim,
    std::optional<ScalarType> dtype) {
  Tensor result;
  mean_out_quantized_cpu(self, opt_dim, keepdim, dtype, result);
  return result;
}
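
// Example (C++ frontend), a minimal sketch assuming the usual QuantizedCPU
// registration of this kernel for aten::mean:
//   at::Tensor x = at::rand({2, 3, 4, 4});
//   at::Tensor qx = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/10, at::kQUInt8);
//   at::Tensor qm = at::mean(qx, /*dim=*/{2, 3}, /*keepdim=*/true);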

// qstd
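// The std fast path requires reducing over contiguous innermost dims and is
// disabled when the reduction covers a single element with a nonzero
// correction, since the corrected denominator would be zero.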
inline bool is_std_inner_dim_fast_path(
    const Tensor& self,
    OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction) {
  // Do not enter fast path if there are too few elements
  IntArrayRef dims = dim.has_value() ? dim.value() : IntArrayRef();
  auto all_dims = std::vector<int64_t>(self.dim());
  std::iota(all_dims.begin(), all_dims.end(), 0);
  dims = dims.empty() ? all_dims : dims;
  bool has_correction = !correction.value_or(1).equal(0);
  int64_t num_ele = 1;
  for (auto d : dims) {
    num_ele *= self.size(d);
  }
  if (num_ele == 1 && has_correction) {
    return false;
  }
  return is_innnermost_dim(self, dims);
}

Tensor& std_out_quantized_cpu(
    const Tensor& self,
    OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction,
    bool keepdim,
    Tensor& result) {
  // Fast path
  if (self.is_contiguous(c10::MemoryFormat::Contiguous) &&
      is_std_inner_dim_fast_path(self, dim, correction)) {
    qstd_inner_dim_stub(self.device().type(), self, dim, correction, keepdim, result);
    return result;
  }

  // Reference path
  auto self_dequantized = self.dequantize();
  auto result_dequantized = at::std(self_dequantized, dim, correction, keepdim);
  result = at::quantize_per_tensor(
      result_dequantized,
      self.q_scale(),
      self.q_zero_point(),
      self.scalar_type());
  return result;
}

Tensor std_quantized_cpu(
    const Tensor& self,
    OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction,
    bool keepdim) {
  Tensor result;
  std_out_quantized_cpu(self, dim, correction, keepdim, result);
  return result;
}
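
// Example (C++ frontend), a minimal sketch assuming the usual QuantizedCPU
// registration of this kernel for aten::std.correction:
//   at::Tensor qs = at::std(qx, /*dim=*/{-1}, /*correction=*/1, /*keepdim=*/false);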

} // namespace native
} // namespace at