// aten/src/ATen/quantized/Quantizer.cpp
#include <ATen/ArrayRef.h>
#include <ATen/ATen.h>
#include <ATen/ceil_div.h>
#include <ATen/core/Tensor.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/Dispatch.h>
#include <ATen/native/quantized/AffineQuantizer.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/NativeFunctions.h>
#include <ATen/quantized/QTensorImpl.h>
#include <ATen/quantized/Quantizer.h>
#include <c10/core/CPUAllocator.h>
#include <c10/util/accumulate.h>

#include <cmath>
#include <utility>

namespace at {

namespace {

  void checkPerChannelParamDims(const Tensor& scales, const Tensor& zero_points) {
    TORCH_CHECK(scales.dim() == 1, "scale tensor must have dimension 1");
    TORCH_CHECK(
        zero_points.dim() == 1, "zero_points tensor must have dimension 1");
    TORCH_CHECK(
        scales.numel() == zero_points.numel(),
        "number of elements in scales and zero_points must match");
  }

} // anonymous namespace

// Note: this is not a native function as Quantizer is not exposed to Python yet
QuantizerPtr TensorBase::quantizer() const {
  // This is a terrible hack to emulate what VariableType is doing
  at::AutoDispatchBelowAutograd mode;
  return get_qtensorimpl(*this)->quantizer();
}

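// Constructs a quantizer that maps float values to the quantized domain with
// a single (scale, zero_point) pair. A minimal usage sketch (hypothetical
// values; `rt` is assumed to be a float CPU tensor):
//
//   QuantizerPtr q = make_per_tensor_affine_quantizer(
//       /*scale=*/0.1, /*zero_point=*/10, kQUInt8);
//   Tensor qt = q->quantize(rt);   // quint8 storage
//   Tensor rt2 = qt.dequantize();  // approximately rt, up to rounding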
QuantizerPtr make_per_tensor_affine_quantizer(
    double scale,
    int64_t zero_point,
    ScalarType scalar_type) {
  return c10::make_intrusive<PerTensorAffineQuantizer>(scalar_type,
      scale, zero_point);
}

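// Per-channel variant: `scales` and `zero_points` are 1-D tensors with one
// entry per slice along `axis`. A minimal usage sketch (hypothetical values,
// two channels):
//
//   Tensor s = at::tensor({0.1, 0.2}, at::kFloat);
//   Tensor zp = at::tensor({0, 5}, at::kLong);
//   QuantizerPtr q = make_per_channel_affine_quantizer(s, zp, /*axis=*/0, kQInt8);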
QuantizerPtr make_per_channel_affine_quantizer(
    const Tensor& scales,
    const Tensor& zero_points,
    int64_t axis,
    ScalarType scalar_type) {
  checkPerChannelParamDims(scales, zero_points);
  TORCH_CHECK(
      isFloatingType(scales.scalar_type()),
      "scale tensor must be floating point");

  if (isFloatingType(zero_points.scalar_type())) {
    Tensor scales_float = scales.to(kFloat).contiguous();
    Tensor zero_points_float = zero_points.to(kFloat).contiguous();
    return c10::make_intrusive<PerChannelAffineFloatQParamsQuantizer>(scalar_type,
                                                                      scales_float,
                                                                      zero_points_float,
                                                                      axis);
  } else {
    Tensor scales_double = scales.to(kDouble).contiguous();
    Tensor zero_points_int64 = zero_points.to(kLong).contiguous();
    return c10::make_intrusive<PerChannelAffineQuantizer>(scalar_type,
                                                          scales_double,
                                                          zero_points_int64,
                                                          axis);
  }
}

QTensorImpl* get_qtensorimpl(const TensorBase& self) {
  TORCH_CHECK(
      !self.requires_grad(),
      "quantized tensors do not support autograd");
  TORCH_INTERNAL_ASSERT(self.is_quantized(), "get_qtensorimpl: not a quantized tensor");
  return static_cast<QTensorImpl*>(self.unsafeGetTensorImpl());
}

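// Computes the storage size in bytes, packing sub-byte dtypes along the
// innermost dimension. Worked example: for a QUInt4x2 tensor of shape
// {2, 5} (itemsize 1, two elements per byte), each row of 5 elements packs
// into ceil(5 / 2) = 3 bytes, so the total is 2 * 3 = 6 bytes.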
static int64_t get_sub_byte_tensor_size(IntArrayRef sizes, size_t dtype_itemsize, at::ScalarType t) {
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t element_per_byte;
  switch (t) {
    case at::ScalarType::QUInt4x2:
      element_per_byte = 2;
      break;
    case at::ScalarType::QUInt2x4:
      element_per_byte = 4;
      break;
    default:
      element_per_byte = 1;
  }
  // Zero-dim tensor: multiply_integers over an empty range is 1, so this
  // reserves one full itemsize for the single element.
  if (sizes.empty()) {
    return c10::multiply_integers(sizes) * dtype_itemsize;
  }
  // Treat the innermost dim as columns.
  int64_t cols = sizes.at(sizes.size() - 1);
  int64_t bytes_per_row = cols * dtype_itemsize;
  // Pack the innermost dim: each row occupies ceil(bytes_per_row / element_per_byte) bytes.
  return c10::multiply_integers(IntArrayRef(sizes.data(), sizes.size() - 1)) * at::ceil_div(bytes_per_row, element_per_byte);
}

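// Allocates an uninitialized quantized tensor: picks an allocator for the
// target device, sizes the storage via get_sub_byte_tensor_size, and builds
// a QTensorImpl that carries the quantizer alongside the usual tensor state.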
inline Tensor new_qtensor(
    IntArrayRef sizes,
    const TensorOptions& options,
    QuantizerPtr quantizer) {
  auto memory_format = options.memory_format_opt().value_or(MemoryFormat::Contiguous);
  auto device = options.device();
  at::Allocator* allocator = nullptr;
  // TODO: why isn't this just using GetAllocator?
  if (device.is_cuda()) {
    allocator = at::detail::getCUDAHooks().getCUDADeviceAllocator();
  } else if (device.is_cpu()) {
    allocator = at::getCPUAllocator();
  } else if (device.is_meta()) {
    allocator = GetAllocator(kMeta);
  } else if (device.is_privateuseone()) {
    allocator = GetAllocator(kPrivateUse1);
  } else {
    TORCH_INTERNAL_ASSERT(0, "unrecognized device for new_qtensor: ", device);
  }

#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {
    TORCH_CHECK(!device.is_cuda(), "It looks like you are trying to quantize a CUDA tensor ",
                "while the QNNPACK backend is enabled. Although not expected to happen in ",
                "practice, you might have done it for testing purposes. ",
                "Please either change the quantization engine or move the tensor to the CPU.");
    allocator = c10::GetDefaultMobileCPUAllocator();
  }
#endif

  at::DispatchKey tensorDispatchKey = options.computeDispatchKey();
  native::check_size_nonnegative(sizes);
  auto dtype = options.dtype();
  TORCH_CHECK(
      isQIntType(typeMetaToScalarType(dtype)),
      "ScalarType ",
      typeMetaToScalarType(dtype),
      " is not supported in new_qtensor.");
  auto scalar_type = typeMetaToScalarType(dtype);
  int64_t size_bytes = get_sub_byte_tensor_size(sizes, dtype.itemsize(), scalar_type);

  auto storage = make_storage_impl(
      StorageImpl::use_byte_size_t(),
      size_bytes,
      allocator->allocate(size_bytes),
      allocator,
      /*resizable=*/true,
      device);
  auto tensor = detail::make_tensor<QTensorImpl>(
      storage, at::DispatchKeySet(tensorDispatchKey), dtype, quantizer);
  get_qtensorimpl(tensor)->set_sizes_contiguous(sizes);
  get_qtensorimpl(tensor)->empty_tensor_restride(memory_format);
  return tensor;
}

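// Per-tensor affine quantization applies the standard elementwise mapping
//   q = clamp(round(x / scale) + zero_point, q_min, q_max);
// the arithmetic itself lives in the quantize_tensor_per_tensor_affine kernel.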
Tensor PerTensorAffineQuantizer::quantize(const Tensor& rtensor) {
  TORCH_CHECK(
      rtensor.scalar_type() == kFloat,
      "Quantize only works on Float Tensor, got ", rtensor.scalar_type());
  // Here we need a std::intrusive_ptr<Quantizer>, but actually "this" is the
  // quantizer that can be reused, so I'm using intrusive_from_this here
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options()
          .dtype(scalar_type_)
          .memory_format(rtensor.suggest_memory_format()),
      intrusive_from_this());

  auto rtensor_contig = rtensor.expect_contiguous(rtensor.suggest_memory_format());
  native::quantize_tensor_per_tensor_affine(
      *rtensor_contig, qtensor, scale_, zero_point_);
  return qtensor;
}

static void per_tensor_affine_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const double scale,
    const int64_t zero_point) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_tensor_affine(
      *qtensor_contig, rtensor, scale, zero_point);
}

Tensor& PerTensorAffineQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_tensor_affine_dequantize_impl(rtensor, qtensor, scale_, zero_point_);
  return rtensor;
}

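// Dequantization inverts the affine mapping: x = (q - zero_point) * scale.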
Tensor PerTensorAffineQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(
      qtensor.sizes(),
      qtensor.options()
          .dtype(at::kFloat)
          .memory_format(qtensor.suggest_memory_format()));
  per_tensor_affine_dequantize_impl(rtensor, qtensor, scale_, zero_point_);
  return rtensor;
}

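// Per-channel quantization applies scales_[c] and zero_points_[c] to every
// element whose index along axis_ is c, e.g. one (scale, zero_point) pair
// per output channel of a convolution weight when axis_ == 0.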
Tensor PerChannelAffineQuantizer::quantize(const Tensor& rtensor) {
  // Here we need a std::intrusive_ptr<Quantizer>, but actually "this" is the
  // quantizer that can be reused, so I'm using intrusive_from_this here
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options()
          .dtype(scalar_type_)
          .memory_format(rtensor.suggest_memory_format()),
      intrusive_from_this());
  auto rtensor_contig = rtensor.expect_contiguous(rtensor.suggest_memory_format());
  native::quantize_tensor_per_channel_affine(
      *rtensor_contig, qtensor, scales_, zero_points_, axis_);
  return qtensor;
}

static void per_channel_affine_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const Tensor& scale,
    const Tensor& zero_point,
    const int64_t axis) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_channel_affine(
      *qtensor_contig, rtensor, scale, zero_point, axis);
}

Tensor PerChannelAffineQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(
      qtensor.sizes(),
      qtensor.options()
          .dtype(at::kFloat)
          .memory_format(qtensor.suggest_memory_format()));
  per_channel_affine_dequantize_impl(rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Tensor& PerChannelAffineQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_channel_affine_dequantize_impl(rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

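// Variant of the per-channel quantizer whose zero points are floating point
// (see the dispatch in make_per_channel_affine_quantizer); both qparam
// tensors are stored as float here.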
Tensor PerChannelAffineFloatQParamsQuantizer::quantize(const Tensor& rtensor) {
  TORCH_CHECK(
      rtensor.scalar_type() == kFloat,
      "Quantize only works on Float Tensor, got ", rtensor.scalar_type());
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options().dtype(scalar_type_),
      intrusive_from_this());
  auto rtensor_contig = rtensor.expect_contiguous();
  native::quantize_tensor_per_channel_float_qparams(
      *rtensor_contig, qtensor, scales_, zero_points_, axis_);
  return qtensor;
}

static void per_channel_affine_float_q_params_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const Tensor& scale,
    const Tensor& zero_point,
    const int64_t axis) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_channel_float_qparams(
      *qtensor_contig, rtensor, scale, zero_point, axis);
}

Tensor PerChannelAffineFloatQParamsQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(qtensor.sizes(), qtensor.options().dtype(at::kFloat));
  per_channel_affine_float_q_params_dequantize_impl(
      rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Tensor& PerChannelAffineFloatQParamsQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_channel_affine_float_q_params_dequantize_impl(
      rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Quantizer::~Quantizer() = default;

C10_EXPORT void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer) {
  get_qtensorimpl(self)->set_quantizer_(quantizer);
}

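// Wraps a caller-owned buffer as a quantized tensor without copying; the
// deleter runs once the last reference to the storage goes away. A minimal
// usage sketch (hypothetical buffer and qparams):
//
//   std::vector<uint8_t> buf(12);
//   Tensor qt = from_blob_quantized_per_tensor_affine(
//       buf.data(), /*sizes=*/{3, 4}, /*strides=*/{4, 1},
//       /*deleter=*/[](void*) {}, /*scale=*/0.5f, /*zeroPoint=*/0,
//       at::device(at::kCPU).dtype(at::kQUInt8));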
Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    IntArrayRef strides,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options) {
  auto dtype = typeMetaToScalarType(options.dtype());
  TORCH_CHECK(
      isQIntType(dtype),
      "from_blob_quantized_per_tensor_affine expects QInt dtypes, got ", dtype);

  const std::size_t itemsize = options.dtype().itemsize();
  std::size_t size = 1;
  for (std::int64_t s : sizes) {
    size *= static_cast<std::size_t>(s);
  }
  const std::size_t datasize = size * itemsize;

  DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
      data, deleter, options.device());

  Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};

  QuantizerPtr quantizer =
      make_per_tensor_affine_quantizer(scale, zeroPoint, dtype);

  Tensor qtensor = at::detail::make_tensor<QTensorImpl>(
      std::move(storage),
      at::DispatchKeySet(options.computeDispatchKey()),
      options.dtype(),
      quantizer);
  get_qtensorimpl(qtensor)->set_sizes_and_strides(sizes, strides);
  return qtensor;
}

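// Convenience overload that derives contiguous row-major strides from sizes
// (e.g. sizes {2, 3} -> strides {3, 1}) before forwarding to the strided
// overload above.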
Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options) {
  std::vector<int64_t> strides;
  const auto ndim = sizes.size();
  if (ndim > 0) {
    strides.resize(ndim);
    // Walk from the innermost dim outwards, accumulating contiguous strides.
    int64_t i = static_cast<int64_t>(ndim) - 1;
    strides[i] = 1;
    while (--i >= 0) {
      strides[i] = sizes[i + 1] * strides[i + 1];
    }
  }
  return from_blob_quantized_per_tensor_affine(
      data,
      sizes,
      strides,
      std::move(deleter),
      scale,
      zeroPoint,
      options);
}

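// Per-channel counterpart: sizes[axis] must match the number of scales and
// zero points. A minimal usage sketch (hypothetical values; `buf` is a
// caller-owned byte buffer as in the per-tensor example, two channels):
//
//   Tensor s = at::tensor({0.1, 0.2}, at::kFloat);
//   Tensor zp = at::tensor({0, 0}, at::kLong);
//   Tensor qt = from_blob_quantized_per_channel_affine(
//       buf.data(), /*sizes=*/{2, 8}, /*deleter=*/[](void*) {}, s, zp,
//       /*axis=*/0, at::device(at::kCPU).dtype(at::kQUInt8));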
Tensor from_blob_quantized_per_channel_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const Tensor& scales,
    const Tensor& zero_points,
    const int64_t axis,
    const TensorOptions& options) {
  checkPerChannelParamDims(scales, zero_points);
  int64_t channel = sizes[axis];
  TORCH_CHECK(
      channel == int64_t(scales.numel()),
      "length of scales must equal the channel dimension, expected ", channel, ", got ", scales.numel());
  TORCH_CHECK(
      channel == int64_t(zero_points.numel()),
      "length of zero_points must equal the channel dimension, expected ", channel, ", got ", zero_points.numel());

  auto dtype = typeMetaToScalarType(options.dtype());
  TORCH_CHECK(
      isQIntType(dtype),
      "from_blob_quantized_per_channel_affine expects QInt dtypes, got ", dtype);

  const std::size_t itemsize = options.dtype().itemsize();
  std::size_t size = 1;
  for (std::int64_t s : sizes) {
    size *= static_cast<std::size_t>(s);
  }
  const std::size_t datasize = size * itemsize;

  DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
      data, deleter, options.device());

  Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};

  QuantizerPtr quantizer =
      make_per_channel_affine_quantizer(scales, zero_points, axis, dtype);

  Tensor qtensor = at::detail::make_tensor<QTensorImpl>(
      std::move(storage),
      at::DispatchKeySet(options.computeDispatchKey()),
      options.dtype(),
      quantizer);
  get_qtensorimpl(qtensor)->set_sizes_contiguous(sizes);

  return qtensor;
}

Tensor UnknownQuantizer::quantize(const Tensor& tensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call quantize on UnknownQuantizer");
}
Tensor UnknownQuantizer::dequantize(const Tensor& qtensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call dequantize on UnknownQuantizer");
}
Tensor& UnknownQuantizer::dequantize_out(Tensor& rtensor, const Tensor& qtensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call dequantize_out on UnknownQuantizer");
}
QScheme UnknownQuantizer::qscheme() const {
  TORCH_INTERNAL_ASSERT(false, "cannot call qscheme on UnknownQuantizer");
}
bool UnknownQuantizer::equalTo(QuantizerPtr other) const {
  TORCH_INTERNAL_ASSERT(false, "cannot call equalTo on UnknownQuantizer");
}
QuantizerPtr make_unknown_quantizer(ScalarType scalar_type) {
  return c10::make_intrusive<UnknownQuantizer>(scalar_type);
}

} // namespace at