#pragma once

#include <c10/core/QScheme.h>
#include <c10/core/MemoryFormat.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/core/ScalarType.h>
#include <c10/core/TensorOptions.h>

#include <ATen/Tensor.h>
#include <ATen/TensorUtils.h>

#include <ATen/core/QuantizerBase.h>

#include <cmath>
#include <memory>
#include <utility>

namespace at {

/**
 * UnknownQuantizer is a placeholder quantizer for functions that implement
 * quantization in a two-step process. First a tensor is allocated with an
 * unknown quantizer, and then the quantization kernel decides what the final
 * quantizer will be.
 */
struct TORCH_API UnknownQuantizer : public Quantizer {
  explicit UnknownQuantizer(ScalarType scalar_type)
    : Quantizer(scalar_type) {}

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
  QScheme qscheme() const override;
  bool equalTo(QuantizerPtr other) const override;
};

/**
 * UniformQuantizer is the parent class for all uniform quantizers.
 * These quantization schemes map float values uniformly to
 * quantized values. For example, the affine quantizer is
 * the most commonly used scheme in this category.
 */
struct TORCH_API UniformQuantizer : public Quantizer {
  explicit UniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};

/**
 * NonUniformQuantizer is the parent class for all non-uniform quantizers.
 * These quantization schemes may map float values non-uniformly to quantized
 * values. K-means quantization is a representative example in this category.
 */
struct TORCH_API NonUniformQuantizer : public Quantizer {
  explicit NonUniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};

// There is also StochasticQuantizer, which is uniform but not affine.

/**
 * AffineQuantizer uses an affine transformation to do quantization.
 *
 * For quantize:
 *   Y = clamp(round(X / scale + zero_point), min, max)
 * For dequantize:
 *   X = (Y - zero_point) * scale
 */
struct TORCH_API AffineQuantizer : public UniformQuantizer {
  explicit AffineQuantizer(ScalarType scalar_type) : UniformQuantizer(scalar_type) {}
};

// Note that we will not have a Symmetric Quantizer in the backend, to reduce
// complications in the quantized kernel implementation.
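// Worked example of the AffineQuantizer equations above (illustrative values,
// not part of the API): with scale = 0.1, zero_point = 128, and a kQUInt8
// range of [0, 255]:
//
//   quantize:   X = 1.0 -> Y = clamp(round(1.0 / 0.1 + 128), 0, 255) = 138
//   dequantize: Y = 138 -> X = (138 - 128) * 0.1 = 1.0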
/**
 * PerTensorAffineQuantizer stores a scale and a zero_point, which are used for
 * all the values in the Tensor.
 */
struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer {
  explicit PerTensorAffineQuantizer(ScalarType scalar_type, double scale, int64_t zero_point)
    : AffineQuantizer(scalar_type),
      scale_(scale),
      zero_point_(zero_point) {}

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

  QScheme qscheme() const override {
    return kPerTensorAffine;
  }

  double scale() const {
    return scale_;
  }

  int64_t zero_point() const {
    return zero_point_;
  }

  bool equalTo(QuantizerPtr other) const override {
    if (!other.get() || other->qscheme() != kPerTensorAffine) {
      return false;
    }
    auto* other_per_tensor_affine =
        static_cast<PerTensorAffineQuantizer*>(other.get());
    return scalar_type() == other_per_tensor_affine->scalar_type() &&
        scale() == other_per_tensor_affine->scale() &&
        zero_point() == other_per_tensor_affine->zero_point();
  }

 private:
  const double scale_;
  // We use int64_t for consistency with Python
  const int64_t zero_point_;
};
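// Example (sketch): building a per-tensor affine quantized tensor through the
// factory functions declared later in this header. The scale, zero point, and
// sizes are illustrative values.
//
//   QuantizerPtr quantizer = make_per_tensor_affine_quantizer(
//       /*scale=*/0.1, /*zero_point=*/128, kQUInt8);
//   Tensor qt = new_qtensor({2, 3}, at::device(kCPU).dtype(kQUInt8), quantizer);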
/**
 * PerChannelAffineQuantizer is the same as PerTensorAffineQuantizer
 * except that we have an independent scale and zero_point parameter
 * for each channel.
 *
 * Also note that per-channel quantization is mostly applied to the output
 * channels of weights, since per-input-channel weight quantization or
 * per-channel activation quantization cannot be efficiently supported on most
 * processors: it would require each multiplication result within a single
 * dot product to have a different scale.
 */
struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer {
  explicit PerChannelAffineQuantizer(
      ScalarType scalar_type,
      Tensor scales,
      Tensor zero_points,
      int64_t axis)
    : AffineQuantizer(scalar_type),
      scales_(std::move(scales)),
      zero_points_(std::move(zero_points)),
      axis_(axis) {}

  QScheme qscheme() const override {
    return kPerChannelAffine;
  }

  Tensor scales() const {
    return scales_;
  }

  Tensor zero_points() const {
    return zero_points_;
  }

  int64_t axis() const {
    return axis_;
  }

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

  bool equalTo(QuantizerPtr other) const override {
    if (!other.get() || other->qscheme() != kPerChannelAffine) {
      return false;
    }
    auto* other_per_channel_affine =
        static_cast<PerChannelAffineQuantizer*>(other.get());
    return scalar_type() == other_per_channel_affine->scalar_type() &&
        scales().equal(other_per_channel_affine->scales()) &&
        zero_points().equal(other_per_channel_affine->zero_points()) &&
        axis() == other_per_channel_affine->axis();
  }

 protected:
  Tensor scales_;
  Tensor zero_points_;
  const int64_t axis_;
};
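// Example (sketch): a per-channel quantizer over axis 0 (the output channels
// of a weight tensor), using the factory declared later in this header. The
// scales and zero points are illustrative values.
//
//   Tensor scales = at::tensor({0.1, 0.05}, at::kDouble);
//   Tensor zero_points = at::tensor({0, 0}, at::kLong);
//   QuantizerPtr quantizer = make_per_channel_affine_quantizer(
//       scales, zero_points, /*axis=*/0, kQInt8);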
/**
 * PerChannelAffineFloatQParamsQuantizer is the same as PerChannelAffineQuantizer
 * except that it expects both scale and zero point to be floating point values.
 *
 * This quantizer uses the kPerChannelAffineFloatQParams qscheme, which is a
 * variant of kPerChannelAffine.
 *
 * The quantize equation in this case is:
 *   Xq = (Xf - zero_point) * inv_scale, where inv_scale = 1.0 / scale
 *
 * Note: Using a floating point zero point is useful in cases where 0 doesn't
 * need to be exactly represented in the quantized space. We can get additional
 * precision by using floating point values for the zero point.
 */
struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffineQuantizer {
  explicit PerChannelAffineFloatQParamsQuantizer(
      ScalarType scalar_type,
      Tensor scales,
      Tensor zero_points,
      int64_t axis)
    : PerChannelAffineQuantizer(scalar_type,
        scales,
        zero_points,
        axis) {}

  QScheme qscheme() const override {
    return kPerChannelAffineFloatQParams;
  }

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

  bool equalTo(QuantizerPtr other) const override {
    if (!other.get() || other->qscheme() != kPerChannelAffineFloatQParams) {
      return false;
    }
    auto* other_per_channel_float_qparams =
        static_cast<PerChannelAffineFloatQParamsQuantizer*>(other.get());
    return scalar_type() == other_per_channel_float_qparams->scalar_type() &&
        scales().equal(other_per_channel_float_qparams->scales()) &&
        zero_points().equal(other_per_channel_float_qparams->zero_points()) &&
        axis() == other_per_channel_float_qparams->axis();
  }
};

// This is an internal utility function for getting at the QTensorImpl.
// You should only use this for writing low-level
// setters/getters for QTensorImpl fields; otherwise, you should use
// the low-level setters/getters that were implemented using this.
// This may be called repeatedly, so make sure it's pretty cheap.
TORCH_API QTensorImpl* get_qtensorimpl(const TensorBase& self);

// double and int64_t are used because of the native function API; these are
// the only argument types we have right now in native functions.
TORCH_API QuantizerPtr make_per_tensor_affine_quantizer(
    double scale, int64_t zero_point, ScalarType scalar_type);

TORCH_API QuantizerPtr make_per_channel_affine_quantizer(
    const Tensor& scales,
    const Tensor& zero_points,
    int64_t axis,
    ScalarType scalar_type);

TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type);

// Create a quantized Tensor given arguments for a normal Tensor and a quantizer
TORCH_API Tensor new_qtensor(
    IntArrayRef sizes,
    const TensorOptions& options,
    QuantizerPtr quantizer);

TORCH_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer);

TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    IntArrayRef strides,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

TORCH_API Tensor from_blob_quantized_per_channel_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const Tensor& scales,
    const Tensor& zero_points,
    const int64_t axis,
    const TensorOptions& options);

} // namespace at
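// Example (sketch): wrapping an existing caller-owned buffer as a per-tensor
// affine quantized tensor with at::from_blob_quantized_per_tensor_affine. The
// buffer contents, scale, and zero point are illustrative; the no-op deleter
// reflects that the caller keeps ownership of the storage, which must outlive
// the returned tensor.
//
//   std::vector<uint8_t> buf(6, 138);
//   at::Tensor qt = at::from_blob_quantized_per_tensor_affine(
//       buf.data(), /*sizes=*/{2, 3}, /*deleter=*/[](void*) {},
//       /*scale=*/0.1f, /*zeroPoint=*/128,
//       at::device(at::kCPU).dtype(at::kQUInt8));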