#include <ATen/ArrayRef.h>
#include <ATen/ATen.h>
#include <ATen/ceil_div.h>
#include <ATen/core/Tensor.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/Dispatch.h>
#include <ATen/native/quantized/AffineQuantizer.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/NativeFunctions.h>
#include <ATen/quantized/QTensorImpl.h>
#include <ATen/quantized/Quantizer.h>
#include <c10/core/CPUAllocator.h>
#include <c10/util/accumulate.h>

#include <cmath>
#include <utility>

namespace at {

namespace {

void checkPerChannelParamDims(const Tensor& scales, const Tensor& zero_points) {
  TORCH_CHECK(scales.dim() == 1, "scale tensor must have dimension 1");
  TORCH_CHECK(
      zero_points.dim() == 1, "zero_points tensor must have dimension 1");
  TORCH_CHECK(
      scales.numel() == zero_points.numel(),
      "number of elements in scales and zero_points must match");
}

} // anonymous namespace

// Note: this is not a native function as Quantizer is not exposed to python yet
QuantizerPtr TensorBase::quantizer() const {
  // This is a terrible hack to emulate what VariableType is doing
  at::AutoDispatchBelowAutograd mode;
  return get_qtensorimpl(*this)->quantizer();
}

QuantizerPtr make_per_tensor_affine_quantizer(
    double scale,
    int64_t zero_point,
    ScalarType scalar_type) {
  return c10::make_intrusive<PerTensorAffineQuantizer>(scalar_type,
      scale, zero_point);
}
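
// Illustrative usage of make_per_tensor_affine_quantizer (a sketch, not
// exercised in this file): a single (scale, zero_point) pair applies to the
// whole tensor.
//   QuantizerPtr q = make_per_tensor_affine_quantizer(/*scale=*/0.1, /*zero_point=*/10, kQUInt8);
//   Tensor qx = q->quantize(x);   // x must be a kFloat tensor
//   Tensor x2 = qx.dequantize();  // x2 ~ (qx_int - zero_point) * scale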

QuantizerPtr make_per_channel_affine_quantizer(
    const Tensor& scales,
    const Tensor& zero_points,
    int64_t axis,
    ScalarType scalar_type) {
  checkPerChannelParamDims(scales, zero_points);
  TORCH_CHECK(
      isFloatingType(scales.scalar_type()),
      "scale tensor must be floating point");

  if (isFloatingType(zero_points.scalar_type())) {
    Tensor scales_float = scales.to(kFloat).contiguous();
    Tensor zero_points_float = zero_points.to(kFloat).contiguous();
    return c10::make_intrusive<PerChannelAffineFloatQParamsQuantizer>(scalar_type,
        scales_float,
        zero_points_float,
        axis);
  } else {
    Tensor scales_double = scales.to(kDouble).contiguous();
    Tensor zero_points_int64 = zero_points.to(kLong).contiguous();
    return c10::make_intrusive<PerChannelAffineQuantizer>(scalar_type,
        scales_double,
        zero_points_int64,
        axis);
  }
}
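
// Illustrative usage of make_per_channel_affine_quantizer (a sketch): one
// (scale, zero_point) pair per slice along `axis`. Integral zero_points select
// PerChannelAffineQuantizer; floating-point zero_points select
// PerChannelAffineFloatQParamsQuantizer.
//   Tensor scales = at::rand({64}) * 0.1;             // one scale per output channel
//   Tensor zero_points = at::zeros({64}, at::kLong);  // integral -> PerChannelAffineQuantizer
//   QuantizerPtr q = make_per_channel_affine_quantizer(scales, zero_points, /*axis=*/0, kQInt8);
//   Tensor qweight = q->quantize(weight);             // weight: kFloat with size(0) == 64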

QTensorImpl* get_qtensorimpl(const TensorBase& self) {
  TORCH_CHECK(
      !self.requires_grad(),
      "quantized tensors do not support autograd");
  TORCH_INTERNAL_ASSERT(self.is_quantized(), "get_qtensorimpl: not a quantized tensor");
  return static_cast<QTensorImpl*>(self.unsafeGetTensorImpl());
}

static int64_t get_sub_byte_tensor_size(IntArrayRef sizes, size_t dtype_itemsize, at::ScalarType t) {
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t element_per_byte;
  switch(t) {
    case at::ScalarType::QUInt4x2:
      element_per_byte = 2;
      break;
    case at::ScalarType::QUInt2x4:
      element_per_byte = 4;
      break;
    default:
      element_per_byte = 1;
  }
  // Zero-dim tensor
  if (sizes.empty()) {
    return c10::multiply_integers(sizes) * dtype_itemsize;
  }
  // Treat the innermost dim as columns
  int64_t cols = sizes.at(sizes.size() - 1);
  int64_t bytes_per_row = cols * dtype_itemsize;
  // Align the qtensor's innermost dim: each row takes ceil(bytes_per_row / element_per_byte) bytes
  return c10::multiply_integers(IntArrayRef(sizes.data(), sizes.size() - 1)) * at::ceil_div(bytes_per_row, element_per_byte);
}
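
// Worked example for get_sub_byte_tensor_size (illustrative, assuming
// dtype_itemsize == 1 as for the packed quint4x2 byte type): for
// ScalarType::QUInt4x2 and sizes = {3, 5}, bytes_per_row = 5 and
// ceil(5 / 2) = 3, so the result is 3 rows * 3 bytes = 9 bytes; each row is
// padded up to a whole byte rather than packed across row boundaries.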

inline Tensor new_qtensor(
    IntArrayRef sizes,
    const TensorOptions& options,
    QuantizerPtr quantizer) {
  auto memory_format = options.memory_format_opt().value_or(MemoryFormat::Contiguous);
  auto device = options.device();
  at::Allocator* allocator = nullptr;
  // TODO: why isn't this just using GetAllocator
  if (device.is_cuda()) {
    allocator = at::detail::getCUDAHooks().getCUDADeviceAllocator();
  } else if (device.is_cpu()) {
    allocator = at::getCPUAllocator();
  } else if (device.is_meta()) {
    allocator = GetAllocator(kMeta);
  } else if (device.is_privateuseone()) {
    allocator = GetAllocator(kPrivateUse1);
  } else {
    TORCH_INTERNAL_ASSERT(0, "unrecognized device for new_qtensor: ", device);
  }

#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {
    TORCH_CHECK(!device.is_cuda(), "It looks like you are trying to quantize a CUDA tensor ",
                "while QNNPACK backend is enabled. Although not expected to happen in ",
                "practice, you might have done it for testing purposes. ",
                "Please, either change the quantization engine or move the tensor to a CPU.");
    allocator = c10::GetDefaultMobileCPUAllocator();
  }
#endif

  at::DispatchKey tensorDispatchKey = options.computeDispatchKey();
  native::check_size_nonnegative(sizes);
  auto dtype = options.dtype();
  TORCH_CHECK(
      isQIntType(typeMetaToScalarType(dtype)),
      "ScalarType ",
      typeMetaToScalarType(dtype),
      " is not supported in new_qtensor.");
  auto scalar_type = typeMetaToScalarType(dtype);
  int64_t size_bytes = get_sub_byte_tensor_size(sizes, dtype.itemsize(), scalar_type);

  auto storage = make_storage_impl(
      StorageImpl::use_byte_size_t(),
      size_bytes,
      allocator->allocate(size_bytes),
      allocator,
      /*resizable=*/true,
      device);
  auto tensor = detail::make_tensor<QTensorImpl>(
      storage, at::DispatchKeySet(tensorDispatchKey), dtype, quantizer);
  get_qtensorimpl(tensor)->set_sizes_contiguous(sizes);
  get_qtensorimpl(tensor)->empty_tensor_restride(memory_format);
  return tensor;
}
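
// Illustrative usage of new_qtensor (a sketch; the dtype and shape are
// arbitrary): allocate an uninitialized quantized tensor that the quantize
// kernels below then fill in.
//   QuantizerPtr q = make_per_tensor_affine_quantizer(/*scale=*/0.05, /*zero_point=*/0, kQInt8);
//   Tensor qt = new_qtensor({4, 4}, at::device(kCPU).dtype(kQInt8), q);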

Tensor PerTensorAffineQuantizer::quantize(const Tensor& rtensor) {
  TORCH_CHECK(
      rtensor.scalar_type() == kFloat,
      "Quantize only works on Float Tensor, got ", rtensor.scalar_type());
  // Here we need a std::intrusive_ptr<Quantizer>, but "this" is the quantizer
  // that can be reused, so we use intrusive_from_this here.
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options()
          .dtype(scalar_type_)
          .memory_format(rtensor.suggest_memory_format()),
      intrusive_from_this());

  auto rtensor_contig = rtensor.expect_contiguous(rtensor.suggest_memory_format());
  native::quantize_tensor_per_tensor_affine(
      *rtensor_contig, qtensor, scale_, zero_point_);
  return qtensor;
}
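
// The per-tensor affine mapping implemented by the kernel call above
// (see native/quantized/AffineQuantizer.h) is, per element:
//   q = clamp(round(x / scale_) + zero_point_, qmin, qmax)
// and dequantization recovers x approximately as (q - zero_point_) * scale_.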

static void per_tensor_affine_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const double scale,
    const int64_t zero_point) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_tensor_affine(
      *qtensor_contig, rtensor, scale, zero_point);
}

Tensor& PerTensorAffineQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_tensor_affine_dequantize_impl(rtensor, qtensor, scale_, zero_point_);
  return rtensor;
}

Tensor PerTensorAffineQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(
      qtensor.sizes(),
      qtensor.options()
          .dtype(at::kFloat)
          .memory_format(qtensor.suggest_memory_format()));
  per_tensor_affine_dequantize_impl(rtensor, qtensor, scale_, zero_point_);
  return rtensor;
}

Tensor PerChannelAffineQuantizer::quantize(const Tensor& rtensor) {
  // Here we need a std::intrusive_ptr<Quantizer>, but "this" is the quantizer
  // that can be reused, so we use intrusive_from_this here.
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options()
          .dtype(scalar_type_)
          .memory_format(rtensor.suggest_memory_format()),
      intrusive_from_this());
  auto rtensor_contig = rtensor.expect_contiguous(rtensor.suggest_memory_format());
  native::quantize_tensor_per_channel_affine(
      *rtensor_contig, qtensor, scales_, zero_points_, axis_);
  return qtensor;
}
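
// Same affine mapping as the per-tensor case, except that scale and zero_point
// are indexed by the coordinate c along axis_:
//   q[..., c, ...] = clamp(round(x[..., c, ...] / scales_[c]) + zero_points_[c], qmin, qmax)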

static void per_channel_affine_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const Tensor& scale,
    const Tensor& zero_point,
    const int64_t axis) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_channel_affine(
      *qtensor_contig, rtensor, scale, zero_point, axis);
}

Tensor PerChannelAffineQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(
      qtensor.sizes(),
      qtensor.options()
          .dtype(at::kFloat)
          .memory_format(qtensor.suggest_memory_format()));
  per_channel_affine_dequantize_impl(rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Tensor& PerChannelAffineQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_channel_affine_dequantize_impl(rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Tensor PerChannelAffineFloatQParamsQuantizer::quantize(const Tensor& rtensor) {
  TORCH_CHECK(
      rtensor.scalar_type() == kFloat,
      "Quantize only works on Float Tensor, got ", rtensor.scalar_type());
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options().dtype(scalar_type_),
      intrusive_from_this());
  auto rtensor_contig = rtensor.expect_contiguous();
  native::quantize_tensor_per_channel_float_qparams(
      *rtensor_contig, qtensor, scales_, zero_points_, axis_);
  return qtensor;
}
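
// Unlike the quantizers above, zero_points_ here is floating point, so the
// per-channel dequantization is x ~ (q - zero_point[c]) * scale[c] with no
// requirement that the real value 0.0 map exactly onto an integer code.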

static void per_channel_affine_float_q_params_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const Tensor& scale,
    const Tensor& zero_point,
    const int64_t axis) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_channel_float_qparams(
      *qtensor_contig, rtensor, scale, zero_point, axis);
}

Tensor PerChannelAffineFloatQParamsQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(qtensor.sizes(), qtensor.options().dtype(at::kFloat));
  per_channel_affine_float_q_params_dequantize_impl(
      rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Tensor& PerChannelAffineFloatQParamsQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_channel_affine_float_q_params_dequantize_impl(
      rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Quantizer::~Quantizer() = default;

C10_EXPORT void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer) {
  get_qtensorimpl(self)->set_quantizer_(quantizer);
}

Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    IntArrayRef strides,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options) {
  auto dtype = typeMetaToScalarType(options.dtype());
  TORCH_CHECK(
      isQIntType(dtype),
      "from_blob_quantized_per_tensor_affine expects QInt dtypes, got ", dtype);

  const std::size_t itemsize = options.dtype().itemsize();
  std::size_t size = 1;
  for (std::int64_t s : sizes) {
    size *= static_cast<std::size_t>(s);
  }
  const std::size_t datasize = size * itemsize;

  DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
      data, deleter, options.device());

  Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};

  QuantizerPtr quantizer =
      make_per_tensor_affine_quantizer(scale, zeroPoint, dtype);

  Tensor qtensor = at::detail::make_tensor<QTensorImpl>(
      std::move(storage),
      at::DispatchKeySet(options.computeDispatchKey()),
      options.dtype(),
      quantizer);
  get_qtensorimpl(qtensor)->set_sizes_and_strides(sizes, strides);
  return qtensor;
}
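
// Illustrative usage of from_blob_quantized_per_tensor_affine (a sketch; buf is
// a caller-owned buffer holding at least 4 * 4 kQUInt8 elements): wrap an
// existing quantized buffer without copying.
//   std::vector<uint8_t> buf(4 * 4);
//   Tensor qt = from_blob_quantized_per_tensor_affine(
//       buf.data(), /*sizes=*/{4, 4}, /*strides=*/{4, 1},
//       /*deleter=*/[](void*) {}, /*scale=*/0.1f, /*zeroPoint=*/10,
//       at::device(kCPU).dtype(kQUInt8));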

Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options) {
  // Compute default row-major (contiguous) strides for the given sizes.
  std::vector<int64_t> strides;
  const auto ndim = sizes.size();
  if (ndim > 0) {
    strides.resize(ndim);
    // NOLINTNEXTLINE
    int32_t i = ndim - 1;
    // NOLINTNEXTLINE
    strides[i] = 1;
    while (--i >= 0) {
      strides[i] = sizes[i + 1] * strides[i + 1];
    }
  }
  return from_blob_quantized_per_tensor_affine(
      data,
      sizes,
      strides,
      std::move(deleter),
      scale,
      zeroPoint,
      options);
}

Tensor from_blob_quantized_per_channel_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const Tensor& scales,
    const Tensor& zero_points,
    const int64_t axis,
    const TensorOptions& options) {
  checkPerChannelParamDims(scales, zero_points);
  int64_t channel = sizes[axis];
  TORCH_CHECK(
      channel == int64_t(scales.numel()),
      "length of scales must equal the channel size, expected ", channel, ", got ", scales.numel());
  TORCH_CHECK(
      channel == int64_t(zero_points.numel()),
      "length of zero_points must equal the channel size, expected ", channel, ", got ", zero_points.numel());

  auto dtype = typeMetaToScalarType(options.dtype());
  TORCH_CHECK(
      isQIntType(dtype),
      "from_blob_quantized_per_channel_affine expects QInt dtypes, got ", dtype);

  const std::size_t itemsize = options.dtype().itemsize();
  std::size_t size = 1;
  for (std::int64_t s : sizes) {
    size *= static_cast<std::size_t>(s);
  }
  const std::size_t datasize = size * itemsize;

  DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
      data, deleter, options.device());

  Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};

  QuantizerPtr quantizer =
      make_per_channel_affine_quantizer(scales, zero_points, axis, dtype);

  Tensor qtensor = at::detail::make_tensor<QTensorImpl>(
      std::move(storage),
      at::DispatchKeySet(options.computeDispatchKey()),
      options.dtype(),
      quantizer);
  get_qtensorimpl(qtensor)->set_sizes_contiguous(sizes);

  return qtensor;
}
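
// Illustrative usage of from_blob_quantized_per_channel_affine (a sketch; buf
// is a caller-owned buffer holding at least 4 * 8 kQInt8 elements): wrap a
// per-channel quantized weight buffer, one (scale, zero_point) per row.
//   std::vector<int8_t> buf(4 * 8);
//   Tensor scales = at::rand({4}).to(at::kDouble);
//   Tensor zero_points = at::zeros({4}, at::kLong);
//   Tensor qw = from_blob_quantized_per_channel_affine(
//       buf.data(), /*sizes=*/{4, 8}, /*deleter=*/[](void*) {},
//       scales, zero_points, /*axis=*/0,
//       at::device(kCPU).dtype(kQInt8));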

Tensor UnknownQuantizer::quantize(const Tensor& tensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call quantize on UnknownQuantizer");
}
Tensor UnknownQuantizer::dequantize(const Tensor& qtensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call dequantize on UnknownQuantizer");
}
Tensor& UnknownQuantizer::dequantize_out(Tensor& rtensor, const Tensor& qtensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call dequantize_out on UnknownQuantizer");
}
QScheme UnknownQuantizer::qscheme() const {
  TORCH_INTERNAL_ASSERT(false, "cannot call qscheme on UnknownQuantizer");
}
bool UnknownQuantizer::equalTo(QuantizerPtr other) const {
  TORCH_INTERNAL_ASSERT(false, "cannot call equalTo on UnknownQuantizer");
}
QuantizerPtr make_unknown_quantizer(ScalarType scalar_type) {
  return c10::make_intrusive<UnknownQuantizer>(scalar_type);
}

} // namespace at