xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/kernels/internal/quantization_util.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
#include "tensorflow/lite/kernels/internal/quantization_util.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>

#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
24 
25 namespace tflite {
26 
namespace {
// These constants are used to manipulate the binary representation of doubles.
// Double-precision binary64 floating point format is:
// Bit |  63  |  62-52   |   51-0   |
//     | Sign | Exponent | Fraction |
// To avoid 64-bit integers as much as possible, I break this into high and
// low 32-bit chunks. High is:
// Bit |  31  |  30-20   |      19-0     |
//     | Sign | Exponent | High Fraction |
// Low is:
// Bit |     31-0     |
//     | Low Fraction |
// We then access the components through logical bit-wise operations to
// extract the parts needed, with the positions and masks derived from the
// layout shown above.
// Note: kSignMask's value does not fit in a signed long long, so the 64-bit
// masks carry an explicit ULL suffix instead of relying on the implicit
// re-typing rules for oversized hex literals.
constexpr uint64_t kSignMask = 0x8000000000000000ULL;
constexpr uint64_t kExponentMask = 0x7ff0000000000000ULL;
constexpr int32_t kExponentShift = 52;
constexpr int32_t kExponentBias = 1023;
constexpr uint32_t kExponentIsBadNum = 0x7ff;
constexpr uint64_t kFractionMask = 0x000fffffffc00000ULL;
constexpr uint32_t kFractionShift = 22;
constexpr uint32_t kFractionRoundingMask = 0x003fffff;
constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
}  // namespace
52 
void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
                        int* shift) {
  // Decomposes double_multiplier so that
  //   double_multiplier ~= *quantized_multiplier * 2^(*shift - 31),
  // i.e. the significand is stored as a Q0.31 fixed-point value.
#if TFLITE_SINGLE_ROUNDING
  // Single-rounding MultiplyByQuantizedMultiplier only supports positive
  // multipliers.
  // TFLITE_DCHECK(double_multiplier >= 0);
#endif
  if (double_multiplier == 0.) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
#ifdef TFLITE_EMULATE_FLOAT
  // If we're trying to avoid the use of floating-point instructions (for
  // example on microcontrollers) then use an alternative implementation
  // that only requires integer and bitwise operations. To enable this, you
  // need to set the define during the build process for your platform.
  int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
#else   // TFLITE_EMULATE_FLOAT
  // std::frexp returns q in [0.5, 1); scaling by 2^31 yields the Q0.31
  // significand.
  const double q = std::frexp(double_multiplier, shift);
  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1LL << 31)));
#endif  // TFLITE_EMULATE_FLOAT
  TFLITE_CHECK(q_fixed <= (1LL << 31));
  // Rounding can push the significand up to exactly 2^31 (q == 1.0);
  // renormalize by halving it and bumping the exponent.
  if (q_fixed == (1LL << 31)) {
    q_fixed /= 2;
    ++*shift;
  }
  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
  // A shift amount smaller than -31 would cause all bits to be shifted out
  // and thus all results would be zero. We implement that instead with
  // q_fixed==0, so as to avoid hitting issues with right-shift
  // operations with shift amounts greater than 31. Note that this happens
  // roughly when abs(double_multiplier) < 2^-31 and the present handling means
  // that we're effectively flushing tiny double_multiplier's to zero.
  // We could conceivably handle values in the range (roughly) [32, 63]
  // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
  // the present handling is just doing 'flush denormals to zero'. We could
  // reconsider and actually generate nonzero denormals if a need arises.
  if (*shift < -31) {
    *shift = 0;
    q_fixed = 0;
  }
#if TFLITE_SINGLE_ROUNDING
  // Single-rounding MultiplyByQuantizedMultiplier doesn't support a shift > 30,
  // saturate it.
  if (*shift > 30) {
    *shift = 30;
    q_fixed = (1LL << 31) - 1;
  }
#endif
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
105 
void QuantizeMultiplierGreaterThanOne(double double_multiplier,
                                      int32_t* quantized_multiplier,
                                      int* left_shift) {
  // Convenience wrapper for multipliers known to exceed 1.0. Such values
  // always decompose with a non-negative exponent, which is reported as a
  // left shift and CHECKed on the way out.
  TFLITE_CHECK_GT(double_multiplier, 1.);
  QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
  TFLITE_CHECK_GE(*left_shift, 0);
}
113 
QuantizeMultiplierSmallerThanOneExp(double double_multiplier,int32_t * quantized_multiplier,int * left_shift)114 void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
115                                          int32_t* quantized_multiplier,
116                                          int* left_shift) {
117   TFLITE_CHECK_LT(double_multiplier, 1.);
118   TFLITE_CHECK_GT(double_multiplier, 0.);
119   int shift;
120   QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
121   TFLITE_CHECK_LE(shift, 0);
122   *left_shift = shift;
123 }
124 
IntegerFrExp(double input,int * shift)125 int64_t IntegerFrExp(double input, int* shift) {
126   // Make sure our assumptions about the double layout hold.
127   TFLITE_CHECK_EQ(8, sizeof(double));
128 
129   // We want to access the bits of the input double value directly, which is
130   // tricky to do safely, so use a union to handle the casting.
131   union {
132     double double_value;
133     uint64_t double_as_uint;
134   } cast_union;
135   cast_union.double_value = input;
136   const uint64_t u = cast_union.double_as_uint;
137 
138   // If the bitfield is all zeros apart from the sign bit, this is a normalized
139   // zero value, so return standard values for this special case.
140   if ((u & ~kSignMask) == 0) {
141     *shift = 0;
142     return 0;
143   }
144 
145   // Deal with NaNs and Infs, which are always indicated with a fixed pattern in
146   // the exponent, and distinguished by whether the fractions are zero or
147   // non-zero.
148   const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
149   if (exponent_part == kExponentIsBadNum) {
150     *shift = std::numeric_limits<int>::max();
151     if (u & kFractionMask) {
152       // NaN, so just return zero (with the exponent set to INT_MAX).
153       return 0;
154     } else {
155       // Infinity, so return +/- INT_MAX.
156       if (u & kSignMask) {
157         return std::numeric_limits<int64_t>::min();
158       } else {
159         return std::numeric_limits<int64_t>::max();
160       }
161     }
162   }
163 
164   // The shift is fairly easy to extract from the high bits of the double value,
165   // just by masking it out and applying a bias. The std::frexp() implementation
166   // always returns values between 0.5 and 1.0 though, whereas the exponent
167   // assumes 1.0 to 2.0 is the standard range, so I add on one to match that
168   // interface.
169   *shift = (exponent_part - kExponentBias) + 1;
170 
171   // There's an implicit high bit in the double format definition, so make sure
172   // we include that at the top, and then reconstruct the rest of the fractional
173   // value from the remaining fragments.
174   int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
175 
176   // We're cutting off some bits at the bottom, so to exactly match the standard
177   // frexp implementation here we'll apply rounding by adding one to the least
178   // significant bit of the result if the discarded portion is over half of the
179   // maximum.
180   if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
181     fraction += 1;
182   }
183   // Negate the fraction if the sign bit was set.
184   if (u & kSignMask) {
185     fraction *= -1;
186   }
187 
188   return fraction;
189 }
190 
DoubleFromFractionAndShift(int64_t fraction,int shift)191 double DoubleFromFractionAndShift(int64_t fraction, int shift) {
192   union {
193     double double_value;
194     uint64_t double_as_uint;
195   } result;
196 
197   // Detect NaNs and infinities.
198   if (shift == std::numeric_limits<int>::max()) {
199     if (fraction == 0) {
200       return std::numeric_limits<double>::quiet_NaN();
201     } else if (fraction > 0) {
202       return std::numeric_limits<double>::infinity();
203     } else {
204       return -std::numeric_limits<double>::infinity();
205     }
206   }
207 
208   // Return a normalized zero for a zero fraction.
209   if (fraction == 0) {
210     result.double_as_uint = 0;
211     return result.double_value;
212   }
213 
214   bool is_negative = (fraction < 0);
215   int64_t encoded_fraction = is_negative ? -fraction : fraction;
216   int64_t encoded_shift = (shift - 1);
217   while (encoded_fraction < 0x40000000) {
218     encoded_fraction *= 2;
219     encoded_shift -= 1;
220   }
221   while (encoded_fraction > 0x80000000) {
222     encoded_fraction /= 2;
223     encoded_shift += 1;
224   }
225   encoded_fraction -= 0x40000000;
226   if (encoded_shift < -1022) {
227     encoded_shift = -1023;
228   } else if (encoded_shift > 1022) {
229     encoded_shift = 1023;
230   }
231   encoded_shift += kExponentBias;
232   uint64_t encoded_sign = is_negative ? kSignMask : 0;
233   result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
234                           (encoded_fraction << kFractionShift);
235   return result.double_value;
236 }
237 
IntegerDoubleMultiply(double a,double b)238 double IntegerDoubleMultiply(double a, double b) {
239   int a_shift;
240   const int64_t a_fraction = IntegerFrExp(a, &a_shift);
241   int b_shift;
242   const int64_t b_fraction = IntegerFrExp(b, &b_shift);
243   // Detect NaNs and infinities.
244   if (a_shift == std::numeric_limits<int>::max() ||
245       (b_shift == std::numeric_limits<int>::max())) {
246     return std::numeric_limits<double>::quiet_NaN();
247   }
248   const int result_shift = a_shift + b_shift + 1;
249   const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
250   return DoubleFromFractionAndShift(result_fraction, result_shift);
251 }
252 
IntegerDoubleCompare(double a,double b)253 int IntegerDoubleCompare(double a, double b) {
254   int a_shift;
255   const int64_t a_fraction = IntegerFrExp(a, &a_shift);
256   int b_shift;
257   const int64_t b_fraction = IntegerFrExp(b, &b_shift);
258 
259   // Detect NaNs and infinities.
260   if (a_shift == std::numeric_limits<int>::max() ||
261       (b_shift == std::numeric_limits<int>::max())) {
262     return 1;
263   }
264 
265   if ((a_fraction == 0) && (b_fraction < 0)) {
266     return 1;
267   } else if ((a_fraction < 0) && (b_fraction == 0)) {
268     return -1;
269   } else if (a_shift < b_shift) {
270     return -1;
271   } else if (a_shift > b_shift) {
272     return 1;
273   } else if (a_fraction < b_fraction) {
274     return -1;
275   } else if (a_fraction > b_fraction) {
276     return 1;
277   } else {
278     return 0;
279   }
280 }
281 
void PreprocessSoftmaxScaling(double beta, double input_scale,
                              int input_integer_bits,
                              int32_t* quantized_multiplier, int* left_shift) {
  // Quantizes the combined softmax input scaling beta * input_scale, expressed
  // in Q(input_integer_bits).(31-input_integer_bits), into a fixed-point
  // multiplier/shift pair, capping the real multiplier at the largest value
  // the downstream multiply supports.
  //
  // If the overall multiplier (input and beta) is large, then exp() of an
  // input difference of 1 scaled by this will be large.  In other words, we
  // can cap the multiplier and know that, when it is used, the output will be
  // (round to) zero wherever the input is not at the maximum value.

  // If the overall scale is less than one, and input_integer_bits=0, then the
  // result is double equivalent of Q0.31 (actually with more precision). Thus
  // this generates a Q(input_integer_bits).(31-input_integer_bits)
  // representation.
#if TFLITE_SINGLE_ROUNDING
  const double max_real_multiplier = (1LL << 30) - 1.0;
#else
  const double max_real_multiplier = (1LL << 31) - 1.0;
#endif

#ifdef TFLITE_EMULATE_FLOAT
  // Integer-only equivalent of
  //   min(beta * input_scale * 2^(31 - input_integer_bits),
  //       max_real_multiplier),
  // built from the IntegerFrExp()/DoubleFromFractionAndShift() helpers above.
  const double input_beta = IntegerDoubleMultiply(beta, input_scale);
  int shift;
  int64_t fraction = IntegerFrExp(input_beta, &shift);
  shift += (31 - input_integer_bits);
  double input_beta_real_multiplier =
      DoubleFromFractionAndShift(fraction, shift);
  if (IntegerDoubleCompare(input_beta_real_multiplier, max_real_multiplier) >
      0) {
    input_beta_real_multiplier = max_real_multiplier;
  }
#else   // TFLITE_EMULATE_FLOAT
  const double input_beta_real_multiplier =
      std::min<double>(beta * input_scale * (1 << (31 - input_integer_bits)),
                       max_real_multiplier);
#endif  // TFLITE_EMULATE_FLOAT

  QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
                                   quantized_multiplier, left_shift);
}
320 
PreprocessLogSoftmaxScalingExp(double beta,double input_scale,int input_integer_bits,int32_t * quantized_multiplier,int * left_shift,int32_t * reverse_scaling_divisor,int * reverse_scaling_left_shift)321 void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
322                                     int input_integer_bits,
323                                     int32_t* quantized_multiplier,
324                                     int* left_shift,
325                                     int32_t* reverse_scaling_divisor,
326                                     int* reverse_scaling_left_shift) {
327   PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
328                            quantized_multiplier, left_shift);
329 
330   // Also calculate what amounts to the inverse scaling factor for the input.
331   const double real_reverse_scaling_divisor =
332       (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
333   tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
334                                               reverse_scaling_divisor,
335                                               reverse_scaling_left_shift);
336 }
337 
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
                         int total_signed_bits) {
  // Returns floor of
  //   (2^input_integer_bits - 1) * 2^(total_signed_bits - input_integer_bits)
  //     / 2^input_left_shift,
  // the largest input magnitude that still fits after rescaling.
#ifdef TFLITE_EMULATE_FLOAT
  // Integer-only variant: build the numerator with shifts, then divide by
  // 2^input_left_shift with a right shift (which floors for these
  // non-negative values).
  int64_t radius = (1 << input_integer_bits) - 1;
  radius <<= (total_signed_bits - input_integer_bits);
  radius >>= input_left_shift;
  return radius;
#else   // TFLITE_EMULATE_FLOAT
  const double max_input_rescaled =
      1.0 * ((1 << input_integer_bits) - 1) *
      (1LL << (total_signed_bits - input_integer_bits)) /
      (1LL << input_left_shift);
  // Tighten bound using floor.  Suppose that we could use the exact value.
  // After scaling the difference, the result would be at the maximum.  Thus we
  // must ensure that our value has lower magnitude.
  return static_cast<int>(std::floor(max_input_rescaled));
#endif  // TFLITE_EMULATE_FLOAT
}
356 
NudgeQuantizationRange(const float min,const float max,const int quant_min,const int quant_max,float * nudged_min,float * nudged_max,float * nudged_scale)357 void NudgeQuantizationRange(const float min, const float max,
358                             const int quant_min, const int quant_max,
359                             float* nudged_min, float* nudged_max,
360                             float* nudged_scale) {
361   // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
362   const float quant_min_float = static_cast<float>(quant_min);
363   const float quant_max_float = static_cast<float>(quant_max);
364   *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
365   const float zero_point_from_min = quant_min_float - min / *nudged_scale;
366   uint16_t nudged_zero_point;
367   if (zero_point_from_min < quant_min_float) {
368     nudged_zero_point = static_cast<uint16_t>(quant_min);
369   } else if (zero_point_from_min > quant_max_float) {
370     nudged_zero_point = static_cast<uint16_t>(quant_max);
371   } else {
372     nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
373   }
374   *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
375   *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
376 }
377 
FakeQuantizeArray(const float nudged_scale,const float nudged_min,const float nudged_max,const float * input_data,float * output_data,const float size)378 void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
379                        const float nudged_max, const float* input_data,
380                        float* output_data, const float size) {
381   // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
382   const float inv_nudged_scale = 1.0f / nudged_scale;
383 
384   for (int i = 0; i < size; i++) {
385     const float src_val = input_data[i];
386     const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
387     const float clamped_shifted = clamped - nudged_min;
388     const float dst_val =
389         TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
390         nudged_min;
391     output_data[i] = dst_val;
392   }
393 }
394 
CheckedLog2(const float x,int * log2_result)395 bool CheckedLog2(const float x, int* log2_result) {
396   // Using TfLiteRound instead of std::round and std::log instead of
397   // std::log2 to work around these functions being missing in a toolchain
398   // used in some TensorFlow tests as of May 2018.
399   const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
400   const float x_log2_rounded = TfLiteRound(x_log2);
401   const float x_log2_fracpart = x_log2 - x_log2_rounded;
402 
403   *log2_result = static_cast<int>(x_log2_rounded);
404   return std::abs(x_log2_fracpart) < 1e-3f;
405 }
406 
QuantizeMultiplierArray(const double * effective_scales,size_t size,int32_t * effective_scale_significand,int * effective_shift)407 void QuantizeMultiplierArray(const double* effective_scales, size_t size,
408                              int32_t* effective_scale_significand,
409                              int* effective_shift) {
410   for (size_t i = 0; i < size; ++i) {
411     QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
412                        &effective_shift[i]);
413   }
414 }
415 
416 }  // namespace tflite
417