1 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
2 #include <ATen/core/Tensor.h>
3 #include <ATen/Context.h>
4 #include <c10/util/irange.h>
5 #include <torch/custom_class.h>
6
7 #include <ATen/native/quantized/cpu/init_qnnpack.h>
8 #include <ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h>
9 #include <ATen/native/ao_sparse/quantized/cpu/packed_params.h>
10 #include <ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h>
11
12 #ifndef AT_PER_OPERATOR_HEADERS
13 #include <ATen/Functions.h>
14 #else
15 #include <ATen/ops/_empty_affine_quantized.h>
16 #include <ATen/ops/zeros.h>
17 #endif
18
19 #include <algorithm>
20
21 namespace ao {
22 namespace sparse {
23
24 int register_linear_params();
25
26 #ifdef USE_FBGEMM
27 namespace {
28 // Calculate the column offsets.
29 // Note this includes the sum of the columns as well as the scalar term
30 // B_zero_point * K, whereas the row_offsets created by
31 // packing of activation is only the sum of the A rows.
calc_col_offsets_transpose(int K,int N,const int8_t * Bint8,int32_t * B_zero_point,int32_t * col_offsets,c10::QScheme qtype)32 void calc_col_offsets_transpose(
33 int K,
34 int N,
35 const int8_t* Bint8,
36 int32_t* B_zero_point,
37 int32_t* col_offsets,
38 c10::QScheme qtype) {
39 for (const auto i : c10::irange(N)) {
40 int32_t sum = 0;
41 for (const auto j : c10::irange(K)) {
42 sum += Bint8[i * K + j];
43 }
44 if (qtype == c10::kPerTensorAffine) {
45 col_offsets[i] = sum - B_zero_point[0] * K;
46 } else {
47 col_offsets[i] = sum - B_zero_point[i] * K;
48 }
49 }
50 }
51 } // namespace
52
53 c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeight::
prepack(const at::Tensor & weight,const std::optional<at::Tensor> & bias,const int64_t out_features_block_size,const int64_t in_features_block_size)54 prepack(
55 const at::Tensor& weight,
56 const std::optional<at::Tensor>& bias,
57 const int64_t out_features_block_size,
58 const int64_t in_features_block_size) {
59 TORCH_CHECK(
60 weight.dim() == 2,
61 "The weight tensor for ao::sparse::qlinear_prepack (fbgemm) should"
62 " be 2-dimensional.");
63
64 TORCH_CHECK(
65 out_features_block_size == 1 && in_features_block_size == 4,
66 "The out and in features block sizes for ao::sparse::qlinear_prepack",
67 " (fbgemm) should be 1 and 4 respectively (got ", out_features_block_size,
68 " and ", in_features_block_size, ")");
69
70 auto N = weight.size(0);
71 auto K = weight.size(1);
72
73 auto weight_contig = weight.contiguous();
74 const auto qtype = weight.qscheme();
75 std::vector<int32_t> weight_zero_points_int32(1, 0);
76 if (qtype == c10::kPerTensorAffine) {
77 weight_zero_points_int32[0] = weight.q_zero_point();
78 } else if (qtype == c10::kPerChannelAffine) {
79 weight_zero_points_int32.resize(N, 0);
80 for (const auto i : c10::irange(N)) {
81 weight_zero_points_int32[i] =
82 weight.q_per_channel_zero_points()[i].item<int32_t>();
83 }
84 }
85 TORCH_CHECK(
86 std::all_of(
87 weight_zero_points_int32.cbegin(),
88 weight_zero_points_int32.cend(),
89 [](int32_t i) { return i == 0; }),
90 "zero point(s) should be 0 for the weight tensor of ao::sparse::qlinear op");
91 std::vector<float> weight_scales_float(1, 0.0);
92 if (qtype == c10::kPerTensorAffine) {
93 weight_scales_float[0] = weight.q_scale();
94 } else if (qtype == c10::kPerChannelAffine) {
95 weight_scales_float.resize(N, 0.0);
96 for (const auto i : c10::irange(N)) {
97 weight_scales_float[i] = weight.q_per_channel_scales()[i].item<float>();
98 }
99 }
100
101 int8_t* weight_ptr_int8 =
102 reinterpret_cast<int8_t*>(weight_contig.data_ptr<c10::qint8>());
103
104 std::vector<int32_t> col_offsets(N);
105 calc_col_offsets_transpose(
106 /*K=*/K,
107 /*N=*/N,
108 /*Bint8=*/weight_ptr_int8,
109 /*B_zero_point=*/weight_zero_points_int32.data(),
110 /*col_offsets=*/col_offsets.data(),
111 /*qtype=*/qtype);
112
113 std::optional<at::Tensor> bias_contig;
114 if (bias.has_value()) {
115 const at::Tensor& bias_vec = bias.value();
116 TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)");
117 TORCH_CHECK(
118 bias_vec.size(0) == N,
119 "bias should have N elements: " + std::to_string(N));
120 bias_contig = bias->contiguous();
121 }
122
123 auto bcsr = fbgemm::fbgemmDenseToBCSR<int8_t>(N, K, weight_ptr_int8);
124 auto ret_ptr = c10::make_intrusive<PackedLinearWeight>(
125 std::move(bcsr),
126 bias_contig,
127 col_offsets,
128 weight_scales_float,
129 weight_zero_points_int32,
130 qtype,
131 out_features_block_size,
132 in_features_block_size);
133 return ret_ptr;
134 }
135
136 #endif // USE_FBGEMM
137
138 #ifdef USE_PYTORCH_QNNPACK
139 c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeightQnnp::
prepack(const at::Tensor & weight,const std::optional<at::Tensor> & bias,const int64_t out_features_block_size,const int64_t in_features_block_size)140 prepack(
141 const at::Tensor& weight,
142 const std::optional<at::Tensor>& bias,
143 const int64_t out_features_block_size,
144 const int64_t in_features_block_size) {
145 at::native::initQNNPACK();
146 return c10::make_intrusive<PackedLinearWeightQnnp>(
147 weight, bias, out_features_block_size, in_features_block_size);
148 }
149
150 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
PackedLinearWeightQnnp(const at::Tensor & weight,const std::optional<at::Tensor> & bias,const int64_t out_features_block_size,const int64_t in_features_block_size)151 PackedLinearWeightQnnp::PackedLinearWeightQnnp(
152 const at::Tensor& weight,
153 const std::optional<at::Tensor>& bias,
154 const int64_t out_features_block_size,
155 const int64_t in_features_block_size)
156 : LinearPackedParamsBase(out_features_block_size, in_features_block_size),
157 orig_bias_(bias),
158 q_scheme_(weight.qscheme()),
159 output_channels_(weight.size(0)),
160 input_channels_(weight.size(1)) {
161 TORCH_CHECK(
162 weight.dim() == 2,
163 "ao::sparse::qlinear (qnnpack): Weight tensor rank should be == 2");
164 TORCH_CHECK(out_features_block_size > 0, "Row block size must be > 0.");
165 TORCH_CHECK(in_features_block_size > 0, "Row block size must be > 0.");
166
167 if (bias.has_value()) {
168 bias_ = bias.value();
169 } else {
170 bias_ = at::zeros(output_channels_, weight.options().dtype(at::kFloat));
171 }
172 TORCH_CHECK(
173 (bias_.ndimension() == 1 && bias_.size(0) == output_channels_),
174 "ao::sparse::qlinear_prepack (qnnpack): Given weight of size ",
175 weight.sizes(),
176 ", expected bias to be 1-dimensional with ",
177 output_channels_,
178 " elements",
179 ", but got bias of size ",
180 bias_.sizes(),
181 " instead");
182
183 // Given bias is supposed to be 1 dim, it is already contiguous,
184 // but the weight might be non-contiguous.
185 at::Tensor weight_contig = weight.contiguous();
186
187 std::tie(w_zero_points_, w_scales_) =
188 make_zero_points_and_scales_tensor(weight_contig);
189 const float* weight_scales_data = w_scales_.const_data_ptr<float>();
190 at::Tensor qnnp_weight = at::_empty_affine_quantized(
191 weight_contig.sizes(),
192 at::device(c10::kCPU).dtype(c10::kQUInt8),
193 weight_scales_data[0],
194 w_zero_points_[0]);
195 auto* qnnp_w_data = qnnp_weight.data_ptr<c10::quint8>();
196 auto wt_numel = weight_contig.numel();
197 int8_t* w_data =
198 reinterpret_cast<int8_t*>(weight_contig.data_ptr<c10::qint8>());
199 for (const auto i : c10::irange(wt_numel)) {
200 qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
201 }
202 bcsr_matrix_ = qnnpack::generateBlockCSRMatrix<uint32_t>(
203 reinterpret_cast<uint8_t*>(qnnp_w_data),
204 output_channels_,
205 input_channels_,
206 out_features_block_size,
207 in_features_block_size,
208 w_zero_points_.data());
209 }
210 #endif // USE_PYTORCH_QNNPACK
211
212 namespace {
213
214 class QLinearPackWeightInt8 final {
215 public:
run(const at::Tensor & weight,const std::optional<at::Tensor> & bias,const int64_t out_features_block_size,const int64_t in_features_block_size)216 static c10::intrusive_ptr<LinearPackedParamsBase> run(
217 const at::Tensor& weight,
218 const std::optional<at::Tensor>& bias,
219 const int64_t out_features_block_size,
220 const int64_t in_features_block_size) {
221 auto& ctx = at::globalContext();
222
223 #ifdef USE_FBGEMM
224 if (ctx.qEngine() == at::QEngine::FBGEMM) {
225 return PackedLinearWeight::prepack(
226 weight, bias, out_features_block_size, in_features_block_size);
227 }
228 #endif
229 #ifdef USE_PYTORCH_QNNPACK
230 if (ctx.qEngine() == at::QEngine::QNNPACK) {
231 return PackedLinearWeightQnnp::prepack(
232 weight, bias, out_features_block_size, in_features_block_size);
233 }
234 #endif
235 TORCH_CHECK(
236 false,
237 "Didn't find engine for operation ao::sparse::qlinear_prepack ",
238 toString(ctx.qEngine()));
239 }
240 };
241
// Register sparse::qlinear_prepack for the QuantizedCPU dispatch key.
TORCH_LIBRARY_IMPL(sparse, QuantizedCPU, m) {
  // Make sure the LinearPackedParamsBase custom class is registered
  // before an op that produces/consumes it is bound.
  register_linear_params();
  m.impl(
      TORCH_SELECTIVE_NAME("sparse::qlinear_prepack"),
      TORCH_FN(QLinearPackWeightInt8::run));
}
248 } // namespace
249 }} // namespace ao::sparse
250