// /aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qrelu.cpp
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/TensorIterator.h>
#include <ATen/native/cpu/Loops.h>
#include <ATen/native/quantized/AffineQuantizer.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/QnnpackUtils.h>
#include <ATen/native/quantized/cpu/QuantizedOps.h>
#include <c10/util/irange.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <torch/library.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_empty_affine_quantized.h>
#include <ATen/ops/_prelu_kernel_native.h>
#include <ATen/ops/hardtanh_native.h>
#include <ATen/ops/leaky_relu_native.h>
#include <ATen/ops/prelu.h>
#include <ATen/ops/prelu_native.h>
#include <ATen/ops/quantize_per_tensor.h>
#include <ATen/ops/relu_native.h>
#endif

#include <algorithm>
#include <limits>

namespace at {
namespace native {

DEFINE_DISPATCH(qrelu_stub);
DEFINE_DISPATCH(qrelu_leaky_stub);
DEFINE_DISPATCH(qprelu_stub);

#ifdef USE_PYTORCH_QNNPACK
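// QNNPACK path for quantized ReLU on quint8 tensors. Because the quantized
// value corresponding to real 0 is the zero point, ReLU reduces to clamping
// every element to the range [zero_point, 255].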
static Tensor qnnpack_relu(Tensor input) {
  Tensor qy;
  TORCH_CHECK(
      input.ndimension() > 0, "qnnpack_relu(): Got empty input tensor");
  TORCH_CHECK(input.scalar_type() == c10::kQUInt8,
               "qnnpack_relu(): Expected input data type ",
               toString(c10::kQUInt8),
               " but got ",
               toString(input.scalar_type()));

  Tensor input_contig = input.contiguous(input.suggest_memory_format());

  const auto zero_point = input_contig.q_zero_point();

  initQNNPACK();

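  // QNNPACK treats the tensor as (batch, channels): dim 0 is the batch and
  // the product of the remaining dims is passed as the per-row "channels",
  // which also serves as the input/output stride below.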
  size_t num_elems = 1;
  for (const auto i : c10::irange(1, input_contig.ndimension())) {
    num_elems *= input_contig.size(i);
  }

  pytorch_qnnp_operator_t qnnpack_operator{nullptr};

  const pytorch_qnnp_status createStatus = pytorch_qnnp_create_clamp_nc_u8(
      num_elems /* channels */,
      zero_point /* output min */,
      std::numeric_limits<uint8_t>::max() /* output max */,
      0 /* flags */,
      &qnnpack_operator);

  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>
      qnnpack_uniq_ptr(qnnpack_operator);

  TORCH_INTERNAL_ASSERT(
      createStatus == pytorch_qnnp_status_success,
      "failed to create QNNPACK Relu operator");

  qy = at::_empty_affine_quantized(
      input_contig.sizes(),
      at::device(kCPU).dtype(input.scalar_type()),
      input_contig.q_scale(),
      input_contig.q_zero_point(),
      input.suggest_memory_format());

  const pytorch_qnnp_status setupStatus = pytorch_qnnp_setup_clamp_nc_u8(
      qnnpack_operator, /* clamp */
      input_contig.size(0) /* batch size */,
      (uint8_t*)input_contig.data_ptr<c10::quint8>() /* input data */,
      num_elems /* input stride */,
      (uint8_t*)qy.data_ptr<c10::quint8>() /* output data */,
      num_elems /* output stride */);
  TORCH_INTERNAL_ASSERT(
      setupStatus == pytorch_qnnp_status_success,
      "failed to setup QNNPACK Relu operator");

  pthreadpool_t threadpool = caffe2::pthreadpool_();

  const pytorch_qnnp_status runStatus =
      pytorch_qnnp_run_operator(qnnpack_operator, threadpool);

  TORCH_INTERNAL_ASSERT(
      runStatus == pytorch_qnnp_status_success,
      "failed to run QNNPACK Relu operator");
  return qy;
}
#endif

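// Out-of-place quantized ReLU. Routes quint8 inputs to QNNPACK when that
// engine is selected; otherwise dispatches to the vectorized qrelu_stub.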
Tensor relu_quantized_cpu(const Tensor& qx) {
  #ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK && qx.scalar_type() == kQUInt8) {
    return qnnpack_relu(qx);
  }
  #endif
  Tensor qy;
  qrelu_stub(qx.device().type(), qx, qy);
  return qy;
}
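// In-place quantized ReLU: clamps each element to the zero point using a
// TensorIterator with both scalar and vectorized kernels.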
Tensor& relu_quantized_cpu_(Tensor& qx) {
  const auto zero_point = qx.q_zero_point();
  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qrelu", [&]() {
    using Vec = Vectorized<scalar_t>;
    auto iter = TensorIterator::unary_op(qx, qx);
    auto zero_point_vec = Vec(scalar_t(zero_point));
    cpu_kernel_vec(
        iter,
        [&](scalar_t value) -> scalar_t {
          return scalar_t(std::max<underlying_t>(value.val_, zero_point));
        },
        [&](Vec value) -> Vec { return value.relu(zero_point_vec); });
  });
  return qx;
}

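// Quantized leaky ReLU (out, functional, and in-place variants) all forward
// to qrelu_leaky_stub; the functional variant allocates an output tensor with
// the input's scale and zero point.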
Tensor& leaky_relu_out_quantized_cpu(const Tensor& self,
                                 const Scalar& negval, Tensor& result) {
  qrelu_leaky_stub(self.device().type(), result, self, negval);
  return result;
}

Tensor leaky_relu_quantized_cpu(const Tensor& self, const Scalar& negval) {
  const auto qx = self.contiguous(self.suggest_memory_format());
  auto qy = at::_empty_affine_quantized(qx.sizes(),
      at::device(kCPU).dtype(self.scalar_type()),
      qx.q_scale(),
      qx.q_zero_point(),
      self.suggest_memory_format());
  qrelu_leaky_stub(self.device().type(), qy, qx, negval);
  return qy;
}

Tensor& leaky_relu_quantized_cpu_(Tensor& self, const Scalar& negval) {
  qrelu_leaky_stub(self.device().type(), self, self, negval);
  return self;
}

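// Quantized PReLU. For 1 <= ndim <= 5 the dedicated qprelu_stub kernel is
// used; otherwise we fall back to a reference path that dequantizes, runs
// at::prelu, and requantizes to quint8 with the requested scale/zero point.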
static Tensor _prelu_kernel_quantized_cpu_impl(const Tensor& self, const Tensor& weight,
                                double output_scale, int64_t output_zero_point) {
  auto ndim = self.dim();
  // for ndim < 1 or > 5, go to reference path
  if (ndim > 5 || ndim < 1) {
    auto x = self.dequantize();
    auto y = at::prelu(x, weight);
    return at::quantize_per_tensor(y, output_scale, output_zero_point, c10::kQUInt8);
  }

  auto qy = at::_empty_affine_quantized(self.sizes(),
      at::device(kCPU)
        .dtype(self.scalar_type()),
      output_scale,
      output_zero_point,
      self.suggest_memory_format());

  qprelu_stub(self.device().type(), qy, self, weight);

  return qy;
}

Tensor _prelu_kernel_quantized_cpu(const Tensor& self, const Tensor& weight) {
  return _prelu_kernel_quantized_cpu_impl(self, weight, self.q_scale(), self.q_zero_point());
}

namespace {
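// relu6 is implemented as a hardtanh clamp to [0, 6] in the quantized domain.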
Tensor quantized_relu6(const Tensor& qx) {
  Tensor qy;
  qy = hardtanh_quantized_cpu(qx, 0.0f, 6.0f);
  return qy;
}

Tensor quantized_relu6_(Tensor& qx) {
  hardtanh_quantized_cpu_(qx, 0.0f, 6.0f);
  return qx;
}

class QRelu6 final {
 public:
  static Tensor run(Tensor qx, bool inplace) {
    if (inplace) {
      return quantized_relu6_(qx);
    } else {
      return quantized_relu6(qx);
    }
  }
};

class QLeakyRelu final {
 public:
  static Tensor run(Tensor self, const Scalar& negative_slope, bool inplace, double output_scale, int64_t output_zero_point) {
    // The inplace argument is currently ignored. TODO: support inplace
    if (inplace) {
      TORCH_WARN("inplace=True is not supported for quantized::leaky_relu yet");
    }
    const auto qx = self.contiguous(self.suggest_memory_format());
    auto qy = at::_empty_affine_quantized(qx.sizes(),
      at::device(kCPU).dtype(self.scalar_type()),
      output_scale,
      output_zero_point,
      self.suggest_memory_format());
    qrelu_leaky_stub(self.device().type(), qy, qx, negative_slope);
    return qy;
  }
};

class QPRelu final {
 public:
  static Tensor run(Tensor self, const Tensor& weight, double output_scale, int64_t output_zero_point) {
    return _prelu_kernel_quantized_cpu_impl(self, weight, output_scale, output_zero_point);
  }
};

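// Register the wrappers above under the QuantizedCPU dispatch key. From
// Python these back calls such as (illustrative)
//   torch.ops.quantized.relu6(qx, False)
// where qx is a per-tensor-affine quantized quint8 tensor.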
TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
  m.impl(TORCH_SELECTIVE_NAME("quantized::relu6"), TORCH_FN(QRelu6::run));
  m.impl(TORCH_SELECTIVE_NAME("quantized::leaky_relu"), TORCH_FN(QLeakyRelu::run));
  m.impl(TORCH_SELECTIVE_NAME("quantized::prelu"), TORCH_FN(QPRelu::run));
}

} // namespace

}}  // namespace at::native