// aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <c10/util/irange.h>
#include <torch/custom_class.h>

#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h>
#include <ATen/native/ao_sparse/quantized/cpu/packed_params.h>
#include <ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/_empty_affine_quantized.h>
#include <ATen/ops/zeros.h>
#endif

#include <algorithm>

namespace ao {
namespace sparse {

int register_linear_params();

#ifdef USE_FBGEMM
namespace {
// Calculate the column offsets.
// Note that each offset includes both the column sum and the scalar term
// B_zero_point * K, whereas the row_offsets created when the activation is
// packed are only the sums of the A rows.
void calc_col_offsets_transpose(
    int K,
    int N,
    const int8_t* Bint8,
    int32_t* B_zero_point,
    int32_t* col_offsets,
    c10::QScheme qtype) {
  for (const auto i : c10::irange(N)) {
    int32_t sum = 0;
    for (const auto j : c10::irange(K)) {
      sum += Bint8[i * K + j];
    }
    if (qtype == c10::kPerTensorAffine) {
      col_offsets[i] = sum - B_zero_point[0] * K;
    } else {
      col_offsets[i] = sum - B_zero_point[i] * K;
    }
  }
}
} // namespace

c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeight::
    prepack(
        const at::Tensor& weight,
        const std::optional<at::Tensor>& bias,
        const int64_t out_features_block_size,
        const int64_t in_features_block_size) {
  TORCH_CHECK(
      weight.dim() == 2,
      "The weight tensor for ao::sparse::qlinear_prepack (fbgemm) should"
      " be 2-dimensional.");

  TORCH_CHECK(
      out_features_block_size == 1 && in_features_block_size == 4,
      "The out and in features block sizes for ao::sparse::qlinear_prepack",
      " (fbgemm) should be 1 and 4 respectively (got ", out_features_block_size,
      " and ", in_features_block_size, ")");

  auto N = weight.size(0);
  auto K = weight.size(1);

  auto weight_contig = weight.contiguous();
  const auto qtype = weight.qscheme();
  std::vector<int32_t> weight_zero_points_int32(1, 0);
  if (qtype == c10::kPerTensorAffine) {
    weight_zero_points_int32[0] = weight.q_zero_point();
  } else if (qtype == c10::kPerChannelAffine) {
    weight_zero_points_int32.resize(N, 0);
    for (const auto i : c10::irange(N)) {
      weight_zero_points_int32[i] =
          weight.q_per_channel_zero_points()[i].item<int32_t>();
    }
  }
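  // The weight is expected to be symmetrically quantized: every zero point
  // must be zero for the sparse qlinear kernels, which is enforced below.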
  TORCH_CHECK(
      std::all_of(
          weight_zero_points_int32.cbegin(),
          weight_zero_points_int32.cend(),
          [](int32_t i) { return i == 0; }),
      "zero point(s) should be 0 for the weight tensor of ao::sparse::qlinear op");
  std::vector<float> weight_scales_float(1, 0.0);
  if (qtype == c10::kPerTensorAffine) {
    weight_scales_float[0] = weight.q_scale();
  } else if (qtype == c10::kPerChannelAffine) {
    weight_scales_float.resize(N, 0.0);
    for (const auto i : c10::irange(N)) {
      weight_scales_float[i] = weight.q_per_channel_scales()[i].item<float>();
    }
  }

  int8_t* weight_ptr_int8 =
      reinterpret_cast<int8_t*>(weight_contig.data_ptr<c10::qint8>());

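  // Precompute the per-output-channel column offsets of the weight; the fbgemm
  // kernels use them (together with the activation row offsets computed at run
  // time) to compensate for zero points when the quantized output is produced.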
  std::vector<int32_t> col_offsets(N);
  calc_col_offsets_transpose(
      /*K=*/K,
      /*N=*/N,
      /*Bint8=*/weight_ptr_int8,
      /*B_zero_point=*/weight_zero_points_int32.data(),
      /*col_offsets=*/col_offsets.data(),
      /*qtype=*/qtype);

  std::optional<at::Tensor> bias_contig;
  if (bias.has_value()) {
    const at::Tensor& bias_vec = bias.value();
    TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)");
    TORCH_CHECK(
        bias_vec.size(0) == N,
        "bias should have N elements: " + std::to_string(N));
    bias_contig = bias->contiguous();
  }

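  // Convert the dense int8 weight matrix to FBGEMM's block compressed sparse
  // row (BCSR) representation; this is what the packed params hold on to.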
  auto bcsr = fbgemm::fbgemmDenseToBCSR<int8_t>(N, K, weight_ptr_int8);
  auto ret_ptr = c10::make_intrusive<PackedLinearWeight>(
      std::move(bcsr),
      bias_contig,
      col_offsets,
      weight_scales_float,
      weight_zero_points_int32,
      qtype,
      out_features_block_size,
      in_features_block_size);
  return ret_ptr;
}

#endif // USE_FBGEMM

#ifdef USE_PYTORCH_QNNPACK
c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeightQnnp::
    prepack(
        const at::Tensor& weight,
        const std::optional<at::Tensor>& bias,
        const int64_t out_features_block_size,
        const int64_t in_features_block_size) {
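  // Make sure QNNPACK is initialized before any packing work is done.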
  at::native::initQNNPACK();
  return c10::make_intrusive<PackedLinearWeightQnnp>(
      weight, bias, out_features_block_size, in_features_block_size);
}

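// For the QNNPACK backend the actual weight packing happens in the constructor
// below; prepack() above only forwards its arguments.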
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
PackedLinearWeightQnnp::PackedLinearWeightQnnp(
    const at::Tensor& weight,
    const std::optional<at::Tensor>& bias,
    const int64_t out_features_block_size,
    const int64_t in_features_block_size)
    : LinearPackedParamsBase(out_features_block_size, in_features_block_size),
      orig_bias_(bias),
      q_scheme_(weight.qscheme()),
      output_channels_(weight.size(0)),
      input_channels_(weight.size(1)) {
  TORCH_CHECK(
      weight.dim() == 2,
      "ao::sparse::qlinear (qnnpack): Weight tensor rank should be == 2");
  TORCH_CHECK(out_features_block_size > 0, "Row block size must be > 0.");
  TORCH_CHECK(in_features_block_size > 0, "Column block size must be > 0.");

  if (bias.has_value()) {
    bias_ = bias.value();
  } else {
    bias_ = at::zeros(output_channels_, weight.options().dtype(at::kFloat));
  }
  TORCH_CHECK(
      (bias_.ndimension() == 1 && bias_.size(0) == output_channels_),
      "ao::sparse::qlinear_prepack (qnnpack): Given weight of size ",
      weight.sizes(),
      ", expected bias to be 1-dimensional with ",
      output_channels_,
      " elements",
      ", but got bias of size ",
      bias_.sizes(),
      " instead");

  // The bias is required to be 1-dimensional, so it is already contiguous;
  // the weight, however, might not be.
  at::Tensor weight_contig = weight.contiguous();

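  // Gather the weight's zero points and scales (per-tensor or per-channel)
  // into the tensors used below.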
  std::tie(w_zero_points_, w_scales_) =
      make_zero_points_and_scales_tensor(weight_contig);
  const float* weight_scales_data = w_scales_.const_data_ptr<float>();
  at::Tensor qnnp_weight = at::_empty_affine_quantized(
      weight_contig.sizes(),
      at::device(c10::kCPU).dtype(c10::kQUInt8),
      weight_scales_data[0],
      w_zero_points_[0]);
  auto* qnnp_w_data = qnnp_weight.data_ptr<c10::quint8>();
  auto wt_numel = weight_contig.numel();
  int8_t* w_data =
      reinterpret_cast<int8_t*>(weight_contig.data_ptr<c10::qint8>());
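  // QNNPACK operates on unsigned 8-bit weights, so shift the signed int8
  // values into the uint8 range.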
  for (const auto i : c10::irange(wt_numel)) {
    qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
  }
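  // Pack the uint8 weights into QNNPACK's block-CSR format using the requested
  // block sizes.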
  bcsr_matrix_ = qnnpack::generateBlockCSRMatrix<uint32_t>(
      reinterpret_cast<uint8_t*>(qnnp_w_data),
      output_channels_,
      input_channels_,
      out_features_block_size,
      in_features_block_size,
      w_zero_points_.data());
}
#endif // USE_PYTORCH_QNNPACK

namespace {

class QLinearPackWeightInt8 final {
 public:
  static c10::intrusive_ptr<LinearPackedParamsBase> run(
      const at::Tensor& weight,
      const std::optional<at::Tensor>& bias,
      const int64_t out_features_block_size,
      const int64_t in_features_block_size) {
    auto& ctx = at::globalContext();

#ifdef USE_FBGEMM
    if (ctx.qEngine() == at::QEngine::FBGEMM) {
      return PackedLinearWeight::prepack(
          weight, bias, out_features_block_size, in_features_block_size);
    }
#endif
#ifdef USE_PYTORCH_QNNPACK
    if (ctx.qEngine() == at::QEngine::QNNPACK) {
      return PackedLinearWeightQnnp::prepack(
          weight, bias, out_features_block_size, in_features_block_size);
    }
#endif
    TORCH_CHECK(
        false,
        "Didn't find engine for operation ao::sparse::qlinear_prepack ",
        toString(ctx.qEngine()));
  }
};

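// Register the prepack op for the QuantizedCPU backend. Illustrative
// Python-side call (hypothetical tensors; assumes the sparse::qlinear_prepack
// schema registered elsewhere):
//   packed = torch.ops.sparse.qlinear_prepack(qweight, bias, 1, 4)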
TORCH_LIBRARY_IMPL(sparse, QuantizedCPU, m) {
  register_linear_params();
  m.impl(
      TORCH_SELECTIVE_NAME("sparse::qlinear_prepack"),
      TORCH_FN(QLinearPackWeightInt8::run));
}
}  // namespace
}}  // namespace ao::sparse