xref: /aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
2 #include <ATen/core/Tensor.h>
3 #include <ATen/Context.h>
4 #include <ATen/Dispatch.h>
5 #include <ATen/ExpandUtils.h>
6 #include <torch/library.h>
7 #include <ATen/quantized/Quantizer.h>
8 #include <ATen/native/quantized/cpu/BinaryOps.h>
9 #include <ATen/native/quantized/cpu/QuantizedOps.h>
10 #include <ATen/native/quantized/cpu/init_qnnpack.h>
11 #include <ATen/native/quantized/cpu/QnnpackUtils.h>
12 #include <ATen/native/quantized/cpu/XnnpackUtils.h>
13 #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
14 
15 #ifndef AT_PER_OPERATOR_HEADERS
16 #include <ATen/Functions.h>
17 #include <ATen/NativeFunctions.h>
18 #else
19 #include <ATen/ops/_empty_affine_quantized.h>
20 #include <ATen/ops/_empty_affine_quantized_native.h>
21 #include <ATen/ops/empty_like.h>
22 #include <ATen/ops/relu_native.h>
23 #endif
24 
25 #include <algorithm>
26 #include <utility>
27 
28 namespace at {
29 namespace native {
30 
31 DEFINE_DISPATCH(qadd_relu_stub);
32 DEFINE_DISPATCH(qadd_stub);
33 DEFINE_DISPATCH(qadd_scalar_relu_stub);
34 DEFINE_DISPATCH(qadd_scalar_stub);
35 
36 namespace {
37 
38 inline void check_inputs(const Tensor& qa, const Tensor& qb) {
39   TORCH_CHECK(
40       qa.qscheme() == kPerTensorAffine,
41       "Only per tensor quantization is supported in Add.");
42   TORCH_CHECK(
43       qa.qscheme() == qb.qscheme(),
44       "Both inputs to Add must have the same quantization scheme.");
45   TORCH_CHECK(
46       qa.scalar_type() == qb.scalar_type(),
47       "Add operands should have same data type.");
48 }
49 
50 // Note: out is assumed to be the same size as self and other.
51 // Note: Addition is only supported when self, other, out are of the same dtype.
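// For illustration only (a sketch mirroring how qadd() below allocates its
// output; `scale` and `zero_point` here are placeholders for the desired
// output quantization parameters): a suitable `out` can be created as
//   auto out = at::_empty_affine_quantized(
//       self.sizes(),
//       at::device(kCPU)
//           .dtype(self.scalar_type())
//           .memory_format(self.suggest_memory_format()),
//       scale,
//       zero_point,
//       std::nullopt);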
52 template <bool ReLUFused = false>
53 Tensor _add_out(Tensor& out, const Tensor& self, const Tensor& other) {
54   if (ReLUFused) {
55     qadd_relu_stub(self.device().type(), out, self, other);
56   } else {
57     qadd_stub(self.device().type(), out, self, other);
58   }
59   return out;
60 }
61 
62 template <bool ReLUFused = false>
63 Tensor _add_scalar_out(Tensor& out, const Tensor& self, const Scalar& other) {
64   TORCH_CHECK(
65       self.qscheme() == kPerTensorAffine,
66       "Only per tensor affine is supported for now!!");
67   // To implement tensor-scalar addition in quantized space, we simply
68   // adjust the quantization parameters based on the following rules:
69   //
70   // Let s = scale, z = zero point, c = other.toFloat(), c_q = round(c/s)
71   // q_min = lowest representable value of scalar type
72   // q_max = highest representable value of scalar type
73   //
74   // Let s' = the calculated scale for the output
75   // z' = the calculated zero-point for the output
76   //
77   // If q_min > z - c_q
78   //   s' = [q_max - (z - c_q)]/[q_max - q_min] * s
79   //   z' = q_min
80   //   Xq' = at::requantize_from_int(Xq - z + c_q, s/s', z')
81   // If q_max < z - c_q
82   //   s' = [(z - c_q) - q_min]/[q_max - q_min] * s
83   //   z' = q_max
84   //   Xq' = at::requantize_from_int(Xq - z + c_q, s/s', z')
85   // Else
86   //   s' = s
87   //   z' = z - c_q
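  //
  // A worked numerical sketch of the first branch (values chosen purely for
  // illustration): for quint8, q_min = 0 and q_max = 255. Suppose s = 0.1,
  // z = 10, and c = 2.0, so c_q = round(2.0 / 0.1) = 20. Then z - c_q = -10,
  // which is below q_min, so
  //   s' = (255 - (-10)) / (255 - 0) * 0.1 ~ 0.1039
  //   z' = 0
  // and each element becomes Xq' = requantize_from_int(Xq - 10 + 20, s/s', 0).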
88 
89   AT_DISPATCH_QINT_TYPES(self.scalar_type(), "qadd_scalar", [&]() {
90     double s = self.q_scale();
91     int64_t z = self.q_zero_point();
92     double c = other.toDouble();
93     // NOLINTNEXTLINE(bugprone-signed-char-misuse)
94     int64_t q_min = std::numeric_limits<underlying_t>::min();
95     int64_t q_max = std::numeric_limits<underlying_t>::max();
96 
97     int64_t c_q = std::nearbyint(c / s);
98 
99     double s_prime;
100     int64_t z_prime;
101 
102     if (q_min > z - c_q) {
103       s_prime = (((double)q_max - (z - c_q))) / ((double)q_max - q_min) * s;
104       z_prime = q_min;
105       set_quantizer_(out, make_per_tensor_affine_quantizer(
106           s_prime, z_prime, self.scalar_type()));
107       if (ReLUFused) {
108         qadd_scalar_relu_stub(self.device().type(), out, self, c_q);
109       } else {
110         qadd_scalar_stub(self.device().type(), out, self, c_q);
111       }
112     } else if (q_max < z - c_q) {
113       s_prime = ((double)(z - c_q) - q_min) / ((double)q_max - q_min) * s;
114       z_prime = q_max;
115       set_quantizer_(out, make_per_tensor_affine_quantizer(
116           s_prime, z_prime, self.scalar_type()));
117       if (ReLUFused) {
118         qadd_scalar_relu_stub(self.device().type(), out, self, c_q);
119       } else {
120         qadd_scalar_stub(self.device().type(), out, self, c_q);
121       }
122     } else {
123       s_prime = s;
124       z_prime = z - c_q;
125       out.copy_(self);
126       set_quantizer_(out, make_per_tensor_affine_quantizer(
127           s_prime, z_prime, self.scalar_type()));
128       if (ReLUFused) {
129         at::native::relu_quantized_cpu_(out);
130       }
131     }
132   });
133   return out;
134 }
135 
136 
137 #ifdef USE_PYTORCH_QNNPACK
138 template <bool ReLUFused = false>
139 Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
140   TORCH_CHECK(qa.ndimension() > 0, "qnnpack_add(): Got empty input tensor.");
141   TORCH_CHECK(qa.scalar_type() == c10::kQUInt8 && qb.scalar_type() == c10::kQUInt8,
142                 "qnnpack_add(): Expected both input data types to be ",
143                 toString(c10::kQUInt8),
144                 " but got ",
145                 toString(qa.scalar_type()),
146                 " and ",
147                 toString(qb.scalar_type()));
148   Tensor qa_contig = qa.contiguous(qa.suggest_memory_format());
149   // qa's memory format is used for qb so that the underlying kernel
150   // can flatten all the dims and iterate over both tensors.
151   // In most cases, both qa and qb are in the same memory format.
152   // When they are not, there is a copy overhead to make qb contiguous
153   // in qa's memory format.
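  // For example (hypothetical shapes, for illustration only): if qa is a
  // 4-d ChannelsLast (NHWC) tensor and qb is a plain contiguous NCHW tensor,
  // the call below copies qb into ChannelsLast so both inputs share one
  // element order.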
154   Tensor qb_contig = qb.contiguous(qa.suggest_memory_format());
155 
156   const auto a_zero_point = qa_contig.q_zero_point();
157   const auto b_zero_point = qb_contig.q_zero_point();
158   const auto a_scale = qa_contig.q_scale();
159   const auto b_scale = qb_contig.q_scale();
160 
161   Tensor qy = at::native::empty_affine_quantized(
162       qa_contig.sizes(),
163       kQUInt8,
164       std::nullopt /* layout */,
165       kCPU,
166       std::nullopt /* pin_memory */,
167       scale,
168       zero_point,
169       qa.suggest_memory_format());
170 
171   if (qa_contig.size(0) == 0) {
172     return qy;
173   }
174 
175   initQNNPACK();
176 
177   pytorch_qnnp_operator_t qnnpack_operator{nullptr};
178 
179   size_t num_elems = qa_contig.numel() / qa_contig.size(0);
180   auto output_min = ReLUFused
181       // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
182       ? activationLimits<uint8_t>(scale, zero_point, Activation::RELU)
183             .first
184       : std::numeric_limits<uint8_t>::min();
185   auto output_max = ReLUFused
186       // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
187       ? activationLimits<uint8_t>(scale, zero_point, Activation::RELU)
188             .second
189       : std::numeric_limits<uint8_t>::max();
190   const pytorch_qnnp_status createStatus = pytorch_qnnp_create_add_nc_q8(
191       num_elems /* input size */,
192       a_zero_point /* a zero_point */,
193       a_scale /* a scale */,
194       b_zero_point /* b zero_point */,
195       b_scale /* b scale */,
196       static_cast<uint8_t>(zero_point) /* sum zero_point */,
197       // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
198       scale /* sum scale */,
199       output_min /* output min */,
200       output_max /* output max */,
201       0 /* flags */,
202       &qnnpack_operator);
203 
204   TORCH_INTERNAL_ASSERT(
205       createStatus == pytorch_qnnp_status_success,
206       "failed to create QNNPACK Add operator");
207 
208   std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>
209       qnnpack_uniq_ptr(qnnpack_operator);
210 
211   const pytorch_qnnp_status setupStatus = pytorch_qnnp_setup_add_nc_q8(
212       qnnpack_operator /* add op */,
213       qa_contig.size(0) /* batch size */,
214       (uint8_t*)qa_contig.data_ptr<c10::quint8>() /* a data */,
215       num_elems /* A stride */,
216       (uint8_t*)qb_contig.data_ptr<c10::quint8>() /* b data */,
217       num_elems /* B stride */,
218       (uint8_t*)qy.data_ptr<c10::quint8>() /* output data */,
219       num_elems /* sum stride */);
220   TORCH_INTERNAL_ASSERT(
221       setupStatus == pytorch_qnnp_status_success,
222       "failed to setup QNNPACK Add operator");
223 
224   pthreadpool_t threadpool = caffe2::pthreadpool_();
225   const pytorch_qnnp_status runStatus =
226       pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
227 
228   TORCH_INTERNAL_ASSERT(
229       runStatus == pytorch_qnnp_status_success,
230       "failed to run QNNPACK Add operator");
231 
232   return qy;
233 }
234 #endif // USE_PYTORCH_QNNPACK
235 
236 #ifdef USE_XNNPACK
237 C10_ALWAYS_INLINE
238 enum xnn_status xnnp_create_add_nd(
239     int8_t azp,
240     float ascale,
241     int8_t bzp,
242     float bscale,
243     int8_t czp,
244     float cscale,
245     int8_t output_min,
246     int8_t output_max,
247     uint32_t flags,
248     xnn_operator_t* op) {
249   return xnn_create_add_nd_qs8(
250       azp,        /* int8_t input1_zero_point   */
251       ascale,     /* float input1_scale         */
252       bzp,        /* int8_t input2_zero_point   */
253       bscale,     /* float input2_scale         */
254       czp,        /* int8_t output_zero_point   */
255       cscale,     /* float output_scale         */
256       output_min, /* int8_t output_min          */
257       output_max, /* int8_t output_max          */
258       flags,      /* uint32_t flags             */
259       op);        /* xnn_operator_t* add_op_out */
260 }
261 
262 C10_ALWAYS_INLINE
263 enum xnn_status xnnp_reshape_add_nd(
264     xnn_operator_t op,
265     const std::vector<size_t>& a_shape,
266     const std::vector<size_t>& b_shape,
267     pthreadpool_t pt_pool) {
268   return xnn_reshape_add_nd_qs8(
269       op,             /* xnn_operator_t add_op      */
270       a_shape.size(), /* size_t num_input1_dims     */
271       a_shape.data(), /* const size_t* input1_shape */
272       b_shape.size(), /* size_t num_input2_dims     */
273       b_shape.data(), /* const size_t* input2_shape */
274       pt_pool);       /* pthreadpool_t threadpool   */
275 }
276 
277 C10_ALWAYS_INLINE
278 enum xnn_status xnnp_setup_add_nd(
279     xnn_operator_t op,
280     const int8_t* da,
281     const int8_t* db,
282     int8_t* dc,
283     pthreadpool_t pt_pool) {
284   return xnn_setup_add_nd_qs8(
285       op,             /* xnn_operator_t add_op      */
286       da,             /* const int8_t* input1       */
287       db,             /* const int8_t* input2       */
288       dc);            /* int8_t* output             */
289 }
290 
291 template <typename scalar_t, bool ReLUFused = false>
292 Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
293   using underlying_t = typename scalar_t::underlying;
294   const string func_name = "xnnp_add()";
295   TORCH_CHECK(qa.ndimension() > 0, func_name, ": Got empty input tensor.");
296   TORCH_CHECK(at::native::xnnpack::available(), func_name, ": XNNPACK is not available");
297 
298   // Use qa's memory format for qb so the XNNPACK kernel can flatten all the
299   // dims.
300   auto qa_mem_format = qa.suggest_memory_format();
301   Tensor qa_contig = qa.contiguous(qa_mem_format);
302   Tensor qb_contig = qb.contiguous(qa_mem_format);
303 
304   const auto a_zero_point = qa_contig.q_zero_point();
305   const auto b_zero_point = qb_contig.q_zero_point();
306   const auto a_scale = qa_contig.q_scale();
307   const auto b_scale = qb_contig.q_scale();
308 
309   Tensor qy = at::native::empty_affine_quantized(
310       at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()),
311       qa.scalar_type(),
312       std::nullopt /* layout */,
313       kCPU,
314       std::nullopt /* pin_memory */,
315       scale,
316       zero_point,
317       qa_mem_format);
318 
319   if (qa_contig.size(0) == 0) {
320     return qy;
321   }
322 
323   xnn_operator_t xnnp_op = nullptr;
324   xnnpack_operator xnnp_add_operator;
325 
326   auto output_max = std::numeric_limits<underlying_t>::max();
327   auto output_min = std::numeric_limits<underlying_t>::min();
328   if (ReLUFused) {
329     /*
330      * FIXME: use activationLimits<T>()
331      * With <T>, MSVC runs into "error C3862: identifier activationLimits not found".
332      */
333     constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
334     constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
335     int64_t qvalue = static_cast<int64_t>(zero_point);
336     qvalue = std::max<int64_t>(qvalue, qmin);
337     output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
338   }
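  // For example (illustrative values only): for qs8 with zero_point = -20,
  // output_min becomes -20, i.e. the quantized value representing real 0.0,
  // so the fused clamp implements ReLU in the quantized domain.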
339 
340   // Create an operator
341   auto status = xnnp_create_add_nd(
342       a_zero_point,
343       a_scale,
344       b_zero_point,
345       b_scale,
346       static_cast<underlying_t>(zero_point),
347       static_cast<float>(scale),
348       output_min,
349       output_max,
350       0,
351       &xnnp_op);
352   xnnp_add_operator = xnnpack_operator(xnnp_op);
353   TORCH_CHECK(
354       status == xnn_status_success,
355       func_name, ": xnn create operator failed(", status,")!");
356 
357   const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
358   const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
359 
360   // Reshape the operator
361   status = xnnp_reshape_add_nd(
362       xnnp_add_operator.get(),
363       qa_shape,
364       qb_shape,
365       caffe2::pthreadpool_());
366 
367   TORCH_CHECK(
368       status == xnn_status_success,
369       func_name, ": xnn reshape operator failed(", status,")!");
370 
371   // Setup the operator
372   status = xnnp_setup_add_nd(
373       xnnp_add_operator.get(),
374       reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
375       reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
376       reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),
377       caffe2::pthreadpool_());
378   TORCH_CHECK(
379       status == xnn_status_success,
380       func_name, ": xnn setup operator failed(", status,")!");
381 
382   // Run the operator
383   status = xnn_run_operator(
384       xnnp_add_operator.get(), /* xnn_operator_t op */
385       caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
386   TORCH_CHECK(
387       status == xnn_status_success,
388       func_name, ": xnn run operator failed(", status,")");
389   return qy;
390 }
391 #endif // USE_XNNPACK
392 
393 template <bool ReLUFused = false>
394 Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
395   check_inputs(qa, qb);
396 
397   if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {
398     TORCH_CHECK(
399         qa.scalar_type() == qb.scalar_type(),
400         "Both inputs to qadd must have same type");
401 
402 #ifdef USE_XNNPACK
403     if (qa.scalar_type() == kQInt8) {
404       return xnnp_add<c10::qint8, ReLUFused>(qa, qb, scale, zero_point);
405     }
406 #endif // USE_XNNPACK
407 
408 #ifdef USE_PYTORCH_QNNPACK
409     if (qa.sizes() == qb.sizes() && /* qnnpack does not support broadcasting */
410         qa.scalar_type() == kQUInt8) {
411       return qnnpack_add<ReLUFused>(qa, qb, scale, zero_point);
412     }
413 #endif // USE_PYTORCH_QNNPACK
414   }
415   auto qc = at::_empty_affine_quantized(
416       qa.sizes(),
417       at::device(kCPU)
418          .dtype(qa.scalar_type())
419          .memory_format(qa.suggest_memory_format()),
420       scale,
421       zero_point,
422       std::nullopt);
423   return _add_out<ReLUFused>(qc, qa, qb);
424 }
425 
426 template <bool ReLUFused = false>
427 Tensor qadd_out(Tensor qa, Tensor qb, Tensor out) {
428   check_inputs(qa, qb);
429   check_inputs(qa, out);
430   return _add_out<ReLUFused>(out, qa, qb);
431 }
432 
433 
434 template <bool ReLUFused = false>
435 Tensor qadd_scalar(Tensor qa, const Scalar& b) {
436   TORCH_CHECK(qa.qscheme() == kPerTensorAffine ||
437               qa.qscheme() == kPerTensorSymmetric,
438               "Only per tensor quantization is supported in Add.");
439   auto qc = at::empty_like(qa, qa.suggest_memory_format());
440   return _add_scalar_out<ReLUFused>(qc, qa, b);
441 }
442 
443 template <bool ReLUFused = false>
444 Tensor qadd_scalar2(Scalar b, Tensor qa) {
445   TORCH_CHECK(qa.qscheme() == kPerTensorAffine ||
446               qa.qscheme() == kPerTensorSymmetric,
447               "Only per tensor quantization is supported in Add.");
448   auto qc = at::empty_like(qa, qa.suggest_memory_format());
449   return _add_scalar_out<ReLUFused>(qc, qa, b);
450 }
451 
452 template <bool ReLUFused = false>
453 Tensor qadd_scalar_out(Tensor qa, const Scalar& b, Tensor out) {
454   check_inputs(qa, out);
455   return _add_scalar_out<ReLUFused>(out, qa, b);
456 }
457 
458 // `torch.jit.trace` will trace Scalar as Tensor
459 // This can be removed after broadcast is supported and
460 // all variations of `quantized::add` are merged into `quantized::add`
461 template <bool ReLUFused = false>
462 Tensor qadd_scalar_tensor(Tensor qa, Tensor b) {
463   return qadd_scalar(std::move(qa), b.item());
464 }
465 
466 // `torch.jit.trace` will trace Scalar as Tensor
467 // This can be removed after broadcast is supported and
468 // all variations of `quantized::add` are merged into `quantized::add`
469 template <bool ReLUFused = false>
470 Tensor qadd_scalar_tensor_out(Tensor qa, Tensor b, Tensor out) {
471   return qadd_scalar_out(std::move(qa), b.item(), std::move(out));
472 }
473 
474 TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
475   m.impl(TORCH_SELECTIVE_NAME("quantized::add"),                 TORCH_FN(qadd</*ReLUFused=*/false>));
476   m.impl(TORCH_SELECTIVE_NAME("quantized::add.out"),             TORCH_FN(qadd_out</*ReLUFused=*/false>));
477   m.impl(TORCH_SELECTIVE_NAME("quantized::add.Scalar"),          TORCH_FN(qadd_scalar</*ReLUFused=*/false>));
478   m.impl(TORCH_SELECTIVE_NAME("quantized::add.Scalar2"),          TORCH_FN(qadd_scalar2</*ReLUFused=*/false>));
479   m.impl(TORCH_SELECTIVE_NAME("quantized::add.Scalar_out"),      TORCH_FN(qadd_scalar_out</*ReLUFused=*/false>));
480   m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu"),            TORCH_FN(qadd</*ReLUFused=*/true>));
481   m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.out"),        TORCH_FN(qadd_out</*ReLUFused=*/true>));
482   m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.Scalar"),     TORCH_FN(qadd_scalar</*ReLUFused=*/true>));
483   m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.Scalar2"),     TORCH_FN(qadd_scalar2</*ReLUFused=*/true>));
484   m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.Scalar_out"), TORCH_FN(qadd_scalar_out</*ReLUFused=*/true>));
485   // deprecated functions, kept for backward compatibility
486   m.impl(TORCH_SELECTIVE_NAME("quantized::add_out"),             TORCH_FN(qadd_out</*ReLUFused=*/false>));
487   m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu_out"),        TORCH_FN(qadd_out</*ReLUFused=*/true>));
488   m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar"),          TORCH_FN(qadd_scalar</*ReLUFused=*/false>));
489   m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu"),     TORCH_FN(qadd_scalar</*ReLUFused=*/true>));
490   m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_out"),      TORCH_FN(qadd_scalar_out</*ReLUFused=*/false>));
491   m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu_out"), TORCH_FN(qadd_scalar_out</*ReLUFused=*/true>));
492   m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar.Tensor"),   TORCH_FN(qadd_scalar_tensor</*ReLUFused=*/false>));
493   m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu.Tensor"), TORCH_FN(qadd_scalar_tensor</*ReLUFused=*/true>));
494   m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_out.Tensor"), TORCH_FN(qadd_scalar_tensor_out</*ReLUFused=*/false>));
495   m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu_out.Tensor"), TORCH_FN(qadd_scalar_tensor_out</*ReLUFused=*/true>));
496 }
497 
498 TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) {
499   m.impl(TORCH_SELECTIVE_NAME("_quantized::add"), TORCH_FN(qadd</*ReLUFused=*/false>));
500 }
501 
502 }  // namespace
503 
504 Tensor quantized_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
505   return qadd<false>(std::move(qa), std::move(qb), scale, zero_point);
506 }
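// A minimal usage sketch (illustrative only; tensors and quantization
// parameters are made up):
//   at::Tensor a = at::rand({2, 3});
//   at::Tensor b = at::rand({2, 3});
//   at::Tensor qa = at::quantize_per_tensor(a, /*scale=*/0.1, /*zero_point=*/10, at::kQUInt8);
//   at::Tensor qb = at::quantize_per_tensor(b, /*scale=*/0.1, /*zero_point=*/10, at::kQUInt8);
//   at::Tensor qc = at::native::quantized_add(qa, qb, /*scale=*/0.2, /*zero_point=*/10);
//   at::Tensor c = qc.dequantize();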
507 
508 }}  // namespace at::native
509