#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/TensorIterator.h>
#include <ATen/native/cpu/Loops.h>
#include <ATen/native/quantized/AffineQuantizer.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/QnnpackUtils.h>
#include <ATen/native/quantized/cpu/QuantizedOps.h>
#include <c10/util/irange.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <torch/library.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_empty_affine_quantized.h>
#include <ATen/ops/_prelu_kernel_native.h>
#include <ATen/ops/hardtanh_native.h>
#include <ATen/ops/leaky_relu_native.h>
#include <ATen/ops/prelu.h>
#include <ATen/ops/prelu_native.h>
#include <ATen/ops/quantize_per_tensor.h>
#include <ATen/ops/relu_native.h>
#endif

#include <algorithm>

namespace at {
namespace native {

DEFINE_DISPATCH(qrelu_stub);
DEFINE_DISPATCH(qrelu_leaky_stub);
DEFINE_DISPATCH(qprelu_stub);

#ifdef USE_PYTORCH_QNNPACK
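// QNNPACK path for quint8 ReLU: implemented as a clamp operator whose output
// range is [zero_point, 255], so quantized values below the zero point
// (i.e. negative real values) are clamped up to the zero point.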
static Tensor qnnpack_relu(Tensor input) {
  Tensor qy;
  TORCH_CHECK(
      input.ndimension() > 0, "qnnpack_relu(): Got empty input tensor");
  TORCH_CHECK(input.scalar_type() == c10::kQUInt8,
              "qnnpack_relu(): Expected input data type ",
              toString(c10::kQUInt8),
              " but got ",
              toString(input.scalar_type()));

  Tensor input_contig = input.contiguous(input.suggest_memory_format());

  const auto zero_point = input_contig.q_zero_point();

  initQNNPACK();

  size_t num_elems = 1;
  for (const auto i : c10::irange(1, input_contig.ndimension())) {
    num_elems *= input_contig.size(i);
  }

  pytorch_qnnp_operator_t qnnpack_operator{nullptr};

  const pytorch_qnnp_status createStatus = pytorch_qnnp_create_clamp_nc_u8(
      num_elems /* channels */,
      zero_point /* output min */,
      std::numeric_limits<uint8_t>::max() /* output max */,
      0 /* flags */,
      &qnnpack_operator);

  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>
      qnnpack_uniq_ptr(qnnpack_operator);

  TORCH_INTERNAL_ASSERT(
      createStatus == pytorch_qnnp_status_success,
      "failed to create QNNPACK Relu operator");

  qy = at::_empty_affine_quantized(
      input_contig.sizes(),
      at::device(kCPU).dtype(input.scalar_type()),
      input_contig.q_scale(),
      input_contig.q_zero_point(),
      input.suggest_memory_format());

  const pytorch_qnnp_status setupStatus = pytorch_qnnp_setup_clamp_nc_u8(
      qnnpack_operator, /* clamp */
      input_contig.size(0) /* batch size */,
      (uint8_t*)input_contig.data_ptr<c10::quint8>() /* input data */,
      num_elems /* input stride */,
      (uint8_t*)qy.data_ptr<c10::quint8>() /* output data */,
      num_elems /* output stride */);
  TORCH_INTERNAL_ASSERT(
      setupStatus == pytorch_qnnp_status_success,
      "failed to setup QNNPACK Relu operator");

  pthreadpool_t threadpool = caffe2::pthreadpool_();

  const pytorch_qnnp_status runStatus =
      pytorch_qnnp_run_operator(qnnpack_operator, threadpool);

  TORCH_INTERNAL_ASSERT(
      runStatus == pytorch_qnnp_status_success,
      "failed to run QNNPACK Relu operator");
  return qy;
}
#endif

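// Out-of-place quantized ReLU. Uses the QNNPACK clamp kernel when the QNNPACK
// engine is selected and the input is quint8; otherwise dispatches to the
// native qrelu_stub kernel, which allocates the output tensor itself.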
Tensor relu_quantized_cpu(const Tensor& qx) {
#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK && qx.scalar_type() == kQUInt8) {
    return qnnpack_relu(qx);
  }
#endif
  Tensor qy;
  qrelu_stub(qx.device().type(), qx, qy);
  return qy;
}
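// In-place quantized ReLU: clamps each quantized element to the tensor's
// zero point, i.e. x = max(x, zero_point), with scalar and vectorized
// lambdas supplied to cpu_kernel_vec.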
Tensor& relu_quantized_cpu_(Tensor& qx) {
  const auto zero_point = qx.q_zero_point();
  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qrelu", [&]() {
    using Vec = Vectorized<scalar_t>;
    auto iter = TensorIterator::unary_op(qx, qx);
    auto zero_point_vec = Vec(scalar_t(zero_point));
    cpu_kernel_vec(
        iter,
        [&](scalar_t value) -> scalar_t {
          return scalar_t(std::max<underlying_t>(value.val_, zero_point));
        },
        [&](Vec value) -> Vec { return value.relu(zero_point_vec); });
  });
  return qx;
}

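// Quantized leaky ReLU. The out variant writes into a caller-provided result
// tensor, the functional variant allocates an output reusing the input's
// scale and zero point, and the in-place variant overwrites self; all three
// forward to qrelu_leaky_stub.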
Tensor& leaky_relu_out_quantized_cpu(const Tensor& self,
                                     const Scalar& negval, Tensor& result) {
  qrelu_leaky_stub(self.device().type(), result, self, negval);
  return result;
}

Tensor leaky_relu_quantized_cpu(const Tensor& self, const Scalar& negval) {
  const auto qx = self.contiguous(self.suggest_memory_format());
  auto qy = at::_empty_affine_quantized(qx.sizes(),
      at::device(kCPU).dtype(self.scalar_type()),
      qx.q_scale(),
      qx.q_zero_point(),
      self.suggest_memory_format());
  qrelu_leaky_stub(self.device().type(), qy, qx, negval);
  return qy;
}

Tensor& leaky_relu_quantized_cpu_(Tensor& self, const Scalar& negval) {
  qrelu_leaky_stub(self.device().type(), self, self, negval);
  return self;
}

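// Shared quantized PReLU implementation. For 1- to 5-dimensional inputs it
// runs the native qprelu_stub kernel into a freshly allocated output with the
// requested scale and zero point; for other ranks it falls back to the
// reference path: dequantize -> at::prelu -> quantize_per_tensor (quint8).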
static Tensor _prelu_kernel_quantized_cpu_impl(const Tensor& self, const Tensor& weight,
                                               double output_scale, int64_t output_zero_point) {
  auto ndim = self.dim();
  // for ndim < 1 or > 5, go to reference path
  if (ndim > 5 || ndim < 1) {
    auto x = self.dequantize();
    auto y = at::prelu(x, weight);
    return at::quantize_per_tensor(y, output_scale, output_zero_point, c10::kQUInt8);
  }

  auto qy = at::_empty_affine_quantized(self.sizes(),
      at::device(kCPU).dtype(self.scalar_type()),
      output_scale,
      output_zero_point,
      self.suggest_memory_format());

  qprelu_stub(self.device().type(), qy, self, weight);

  return qy;
}

Tensor _prelu_kernel_quantized_cpu(const Tensor& self, const Tensor& weight) {
  return _prelu_kernel_quantized_cpu_impl(self, weight, self.q_scale(), self.q_zero_point());
}

namespace {
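// relu6(x) = min(max(x, 0), 6), expressed here as hardtanh with bounds
// [0, 6] in dequantized (real-valued) units.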
Tensor quantized_relu6(const Tensor& qx) {
  Tensor qy;
  qy = hardtanh_quantized_cpu(qx, 0.0f, 6.0f);
  return qy;
}

Tensor quantized_relu6_(Tensor& qx) {
  hardtanh_quantized_cpu_(qx, 0.0f, 6.0f);
  return qx;
}

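// Boxed wrapper for quantized::relu6; selects the in-place or out-of-place
// variant based on the inplace flag.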
class QRelu6 final {
 public:
  static Tensor run(Tensor qx, bool inplace) {
    if (inplace) {
      return quantized_relu6_(qx);
    } else {
      return quantized_relu6(qx);
    }
  }
};

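// Boxed wrapper for quantized::leaky_relu; computes into a new quantized
// tensor with the caller-provided output scale and zero point. In-place
// execution is not implemented yet, so inplace=true only emits a warning.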
class QLeakyRelu final {
 public:
  static Tensor run(Tensor self, const Scalar& negative_slope, bool inplace, double output_scale, int64_t output_zero_point) {
    // TODO: the inplace flag is currently ignored; add in-place support.
    if (inplace) {
      TORCH_WARN("inplace=True is not supported for quantized::leaky_relu yet");
    }
    const auto qx = self.contiguous(self.suggest_memory_format());
    auto qy = at::_empty_affine_quantized(qx.sizes(),
        at::device(kCPU).dtype(self.scalar_type()),
        output_scale,
        output_zero_point,
        self.suggest_memory_format());
    qrelu_leaky_stub(self.device().type(), qy, qx, negative_slope);
    return qy;
  }
};

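// Boxed wrapper for quantized::prelu; forwards to the shared implementation
// with explicit output quantization parameters.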
class QPRelu final {
 public:
  static Tensor run(Tensor self, const Tensor& weight, double output_scale, int64_t output_zero_point) {
    return _prelu_kernel_quantized_cpu_impl(self, weight, output_scale, output_zero_point);
  }
};

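// Register the wrappers above as QuantizedCPU kernels for the quantized
// library. From Python these are typically reached via the torch.ops
// namespace, e.g. torch.ops.quantized.relu6(qx, inplace).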
TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
  m.impl(TORCH_SELECTIVE_NAME("quantized::relu6"), TORCH_FN(QRelu6::run));
  m.impl(TORCH_SELECTIVE_NAME("quantized::leaky_relu"), TORCH_FN(QLeakyRelu::run));
  m.impl(TORCH_SELECTIVE_NAME("quantized::prelu"), TORCH_FN(QPRelu::run));
}

} // namespace

}} // namespace at::native