xref: /aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qhardsigmoid.cpp (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
2 #include <ATen/core/Tensor.h>
3 #include <ATen/Context.h>
4 #include <ATen/native/quantized/cpu/QuantizedOps.h>
5 #include <ATen/native/quantized/cpu/init_qnnpack.h>
6 #include <ATen/native/quantized/cpu/QnnpackUtils.h>
7 #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
8 
9 #ifndef AT_PER_OPERATOR_HEADERS
10 #include <ATen/Functions.h>
11 #include <ATen/NativeFunctions.h>
12 #else
13 #include <ATen/ops/_empty_affine_quantized.h>
14 #include <ATen/ops/hardsigmoid_native.h>
15 #endif
16 
17 #include <algorithm>
18 
19 namespace at {
20 namespace native {
21 
22 DEFINE_DISPATCH(qhardsigmoid_stub);
23 
24 namespace {
25 
26 #ifdef USE_PYTORCH_QNNPACK
qnnpack_hardsigmoid(Tensor input)27 Tensor qnnpack_hardsigmoid(Tensor input) {
28   TORCH_CHECK(input.ndimension() > 0, "qnnpack_hardsigmoid(): Got empty input tensor");
29   TORCH_CHECK(input.scalar_type() == c10::kQUInt8,
30                 "qnnpack_hardsigmoid(): Expected input data type ",
31                 toString(c10::kQUInt8),
32                 " but got ",
33                 toString(input.scalar_type()));
34   initQNNPACK();
35 
36   Tensor input_contig = input.contiguous(input.suggest_memory_format());
37   size_t num_elems = input_contig.numel() / input_contig.size(0);
38   const auto i_zero_point = input_contig.q_zero_point();
39   const auto i_scale = input_contig.q_scale();
40   constexpr float o_scale = 1.0f / 256.0f;
41   constexpr int32_t o_zero_point = 0;
42 
43   pytorch_qnnp_operator_t hardsigmoid_op{nullptr};
44   const pytorch_qnnp_status createStatus = pytorch_qnnp_create_hardsigmoid_nc_q8(
45     num_elems, // channels
46     i_zero_point,
47     i_scale,
48     o_zero_point,
49     o_scale,
50     std::numeric_limits<uint8_t>::min(), // output min
51     std::numeric_limits<uint8_t>::max(), // output max
52     0, // flags
53     &hardsigmoid_op);
54 
55   std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>
56       qnnpack_uniq_ptr(hardsigmoid_op);
57 
58   TORCH_INTERNAL_ASSERT(createStatus == pytorch_qnnp_status_success,
59                         "failed to create QNNPACK Hardsigmoid operator");
60   Tensor qy = at::_empty_affine_quantized(
61     input_contig.sizes(),
62     at::device(kCPU).dtype(input_contig.dtype()),
63     o_scale,
64     o_zero_point,
65     input_contig.suggest_memory_format());
66 
67   const pytorch_qnnp_status setupStatus = pytorch_qnnp_setup_hardsigmoid_nc_q8(
68     hardsigmoid_op,
69     input_contig.size(0), // batch size
70     (uint8_t*)input_contig.data_ptr<c10::quint8>(), // input data
71     num_elems, // input stride
72     (uint8_t*)qy.data_ptr<c10::quint8>(), // output data
73     num_elems); // output stride
74   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
75                         "failed to setup QNNPACK Hardsigmoid operator");
76 
77   pthreadpool_t threadpool = caffe2::pthreadpool_();
78 
79   const pytorch_qnnp_status runStatus =
80     pytorch_qnnp_run_operator(hardsigmoid_op, threadpool);
81 
82   TORCH_INTERNAL_ASSERT(
83     runStatus == pytorch_qnnp_status_success,
84     "failed to run QNNPACK Hardsigmoid operator");
85   return qy;
86 }
87 #endif // USE_PYTORCH_QNNPACK
88 
89 } // namespace
// Quantized CPU entry point for hardsigmoid.
//
// In builds with USE_PYTORCH_QNNPACK, a kQUInt8 input is routed to
// qnnpack_hardsigmoid() when the global quantized engine is QNNPACK. In every
// other case the work is delegated to qhardsigmoid_stub, selected by the
// input's device type; the stub is handed a default-constructed Tensor that
// it is expected to populate with the result.
Tensor hardsigmoid_quantized_cpu(const Tensor& qx) {
#ifdef USE_PYTORCH_QNNPACK
  const bool qnnpack_selected =
      at::globalContext().qEngine() == at::QEngine::QNNPACK;
  if (qnnpack_selected && qx.scalar_type() == kQUInt8) {
    return qnnpack_hardsigmoid(qx);
  }
#endif  // USE_PYTORCH_QNNPACK
  Tensor output;
  const auto device_type = qx.device().type();
  qhardsigmoid_stub(device_type, qx, output);
  return output;
}
101 
// Out-variant of the quantized CPU hardsigmoid: computes the result for `qx`
// and copies it into `result`, which is then returned.
//
// The computation deliberately goes through a freshly created tensor rather
// than writing into `result` directly. The output of hardsigmoid usually has
// quantization parameters that differ from the input's, and quantization is
// currently only supported per entire tensor or per entire channel of a
// tensor.
Tensor& hardsigmoid_out_quantized_cpu(const Tensor& qx, Tensor& result) {
  result.copy_(hardsigmoid_quantized_cpu(qx));
  return result;
}
111 
112 }}  // namespace at::native
113