/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/kernels/internal/quantization_util.h"

#include <algorithm>
#include <cmath>
#include <limits>

#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"

namespace tflite {

namespace {
// These constants are used to manipulate the binary representation of doubles.
// Double-precision binary64 floating point format is:
// Bit |  63  |  62-52   |   51-0   |
//     | Sign | Exponent | Fraction |
// To avoid 64-bit integers as much as possible, I break this into high and
// low 32-bit chunks. High is:
// Bit |  31  |  30-20   |      19-0     |
//     | Sign | Exponent | High Fraction |
// Low is:
// Bit |     31-0     |
//     | Low Fraction |
// We then access the components through logical bit-wise operations to
// extract the parts needed, with the positions and masks derived from the
// layout shown above.
constexpr uint64_t kSignMask = 0x8000000000000000LL;
constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
constexpr int32_t kExponentShift = 52;
constexpr int32_t kExponentBias = 1023;
constexpr uint32_t kExponentIsBadNum = 0x7ff;
constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
constexpr uint32_t kFractionShift = 22;
constexpr uint32_t kFractionRoundingMask = 0x003fffff;
constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
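// Illustrative example of the layout above: the double 1.0 is stored as the
// bit pattern 0x3ff0000000000000, so (bits & kExponentMask) >> kExponentShift
// yields the biased exponent 0x3ff (1023, i.e. an unbiased exponent of 0), and
// (bits & kFractionMask) >> kFractionShift yields 0 because the leading 1 of
// the significand is implicit rather than stored.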
}  // namespace

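// Worked example for QuantizeMultiplier() below (illustrative only): for a
// real multiplier of 0.5, std::frexp returns 0.5 with an exponent of 0, so
// the quantized multiplier becomes round(0.5 * 2^31) = 1 << 30 and the shift
// stays 0. Multiplying a value by 0.5 is then approximated as
// (value * (1 << 30)) * 2^(0 - 31):
//
//   int shift;
//   int32_t q;
//   QuantizeMultiplier(0.5, &q, &shift);  // q == 1 << 30, shift == 0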
void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
                        int* shift) {
#if TFLITE_SINGLE_ROUNDING
  // Single-rounding MultiplyByQuantizedMultiplier only supports positive
  // multipliers.
  // TFLITE_DCHECK(double_multiplier >= 0);
#endif
  if (double_multiplier == 0.) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
#ifdef TFLITE_EMULATE_FLOAT
  // If we're trying to avoid the use of floating-point instructions (for
  // example on microcontrollers) then use an alternative implementation
  // that only requires integer and bitwise operations. To enable this, you
  // need to set the define during the build process for your platform.
  int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
#else   // TFLITE_EMULATE_FLOAT
  const double q = std::frexp(double_multiplier, shift);
  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1LL << 31)));
#endif  // TFLITE_EMULATE_FLOAT
  TFLITE_CHECK(q_fixed <= (1LL << 31));
  if (q_fixed == (1LL << 31)) {
    q_fixed /= 2;
    ++*shift;
  }
  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
  // A shift amount smaller than -31 would cause all bits to be shifted out
  // and thus all results would be zero. We implement that instead with
  // q_fixed==0, so as to avoid hitting issues with right-shift
  // operations with shift amounts greater than 31. Note that this happens
  // roughly when abs(double_multiplier) < 2^-31 and the present handling means
  // that we're effectively flushing tiny double_multiplier's to zero.
  // We could conceivably handle shift magnitudes in the range (roughly)
  // [32, 63] as 'denormals', i.e. (shift==0, q_fixed < 2^30). From that point
  // of view the present handling is just doing 'flush denormals to zero'. We
  // could reconsider and actually generate nonzero denormals if a need arises.
  if (*shift < -31) {
    *shift = 0;
    q_fixed = 0;
  }
#if TFLITE_SINGLE_ROUNDING
  // Single-rounding MultiplyByQuantizedMultiplier doesn't support a shift > 30,
  // saturate it.
  if (*shift > 30) {
    *shift = 30;
    q_fixed = (1LL << 31) - 1;
  }
#endif
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}

void QuantizeMultiplierGreaterThanOne(double double_multiplier,
                                      int32_t* quantized_multiplier,
                                      int* left_shift) {
  TFLITE_CHECK_GT(double_multiplier, 1.);
  QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
  TFLITE_CHECK_GE(*left_shift, 0);
}

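// Illustrative example for QuantizeMultiplierSmallerThanOneExp() below: for
// double_multiplier = 0.25, QuantizeMultiplier() yields a quantized multiplier
// of 1 << 30 with a shift of -1, which satisfies the non-positive-shift check,
// so *left_shift is set to -1.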
void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
                                         int32_t* quantized_multiplier,
                                         int* left_shift) {
  TFLITE_CHECK_LT(double_multiplier, 1.);
  TFLITE_CHECK_GT(double_multiplier, 0.);
  int shift;
  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
  TFLITE_CHECK_LE(shift, 0);
  *left_shift = shift;
}

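// Worked example for IntegerFrExp() below (illustrative only): for input = 1.0
// the stored exponent field is 1023 (equal to the bias), so *shift becomes
// (1023 - 1023) + 1 = 1, and the returned fraction is 0x40000000 (2^30) once
// the implicit leading significand bit is added back in. Interpreted as a
// fraction of 2^31 this is 0.5, and 0.5 * 2^1 == 1.0, matching std::frexp.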
int64_t IntegerFrExp(double input, int* shift) {
  // Make sure our assumptions about the double layout hold.
  TFLITE_CHECK_EQ(8, sizeof(double));

  // We want to access the bits of the input double value directly, which is
  // tricky to do safely, so use a union to handle the casting.
  union {
    double double_value;
    uint64_t double_as_uint;
  } cast_union;
  cast_union.double_value = input;
  const uint64_t u = cast_union.double_as_uint;

  // If the bitfield is all zeros apart from the sign bit, this is a normalized
  // zero value, so return standard values for this special case.
  if ((u & ~kSignMask) == 0) {
    *shift = 0;
    return 0;
  }

  // Deal with NaNs and Infs, which are always indicated with a fixed pattern
  // in the exponent, and distinguished by whether the fractions are zero or
  // non-zero.
  const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
  if (exponent_part == kExponentIsBadNum) {
    *shift = std::numeric_limits<int>::max();
    if (u & kFractionMask) {
      // NaN, so just return zero (with the exponent set to INT_MAX).
      return 0;
    } else {
      // Infinity, so return +/- INT_MAX.
      if (u & kSignMask) {
        return std::numeric_limits<int64_t>::min();
      } else {
        return std::numeric_limits<int64_t>::max();
      }
    }
  }

  // The shift is fairly easy to extract from the high bits of the double
  // value, just by masking it out and applying a bias. The std::frexp()
  // implementation always returns values between 0.5 and 1.0 though, whereas
  // the exponent assumes 1.0 to 2.0 is the standard range, so I add on one to
  // match that interface.
  *shift = (exponent_part - kExponentBias) + 1;

  // There's an implicit high bit in the double format definition, so make sure
  // we include that at the top, and then reconstruct the rest of the
  // fractional value from the remaining fragments.
  int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);

  // We're cutting off some bits at the bottom, so to exactly match the
  // standard frexp implementation here we'll apply rounding by adding one to
  // the least significant bit of the result if the discarded portion is over
  // half of the maximum.
  if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
    fraction += 1;
  }
  // Negate the fraction if the sign bit was set.
  if (u & kSignMask) {
    fraction *= -1;
  }

  return fraction;
}

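// Illustrative round-trip example for DoubleFromFractionAndShift() below:
// DoubleFromFractionAndShift(0x40000000, 1) rebuilds 1.0, since the fraction
// 2^30 is already normalized, the implicit leading bit is stripped off, and
// the shift of 1 maps back to a biased exponent of 1023. Up to the low-order
// bits discarded by IntegerFrExp(), this function is its inverse.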
double DoubleFromFractionAndShift(int64_t fraction, int shift) {
  union {
    double double_value;
    uint64_t double_as_uint;
  } result;

  // Detect NaNs and infinities.
  if (shift == std::numeric_limits<int>::max()) {
    if (fraction == 0) {
      return std::numeric_limits<double>::quiet_NaN();
    } else if (fraction > 0) {
      return std::numeric_limits<double>::infinity();
    } else {
      return -std::numeric_limits<double>::infinity();
    }
  }

  // Return a normalized zero for a zero fraction.
  if (fraction == 0) {
    result.double_as_uint = 0;
    return result.double_value;
  }

  bool is_negative = (fraction < 0);
  int64_t encoded_fraction = is_negative ? -fraction : fraction;
  int64_t encoded_shift = (shift - 1);
  while (encoded_fraction < 0x40000000) {
    encoded_fraction *= 2;
    encoded_shift -= 1;
  }
  while (encoded_fraction > 0x80000000) {
    encoded_fraction /= 2;
    encoded_shift += 1;
  }
  encoded_fraction -= 0x40000000;
  if (encoded_shift < -1022) {
    encoded_shift = -1023;
  } else if (encoded_shift > 1022) {
    encoded_shift = 1023;
  }
  encoded_shift += kExponentBias;
  uint64_t encoded_sign = is_negative ? kSignMask : 0;
  result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
                          (encoded_fraction << kFractionShift);
  return result.double_value;
}

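// Illustrative example for IntegerDoubleMultiply() below: for 0.5 * 0.5, each
// operand decomposes to a fraction of 2^30 with a shift of 0, the product of
// the fractions shifted right by 32 is 2^28, and the result shift is
// 0 + 0 + 1 = 1, so DoubleFromFractionAndShift(2^28, 1) reconstructs 0.25.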
double IntegerDoubleMultiply(double a, double b) {
  int a_shift;
  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
  int b_shift;
  const int64_t b_fraction = IntegerFrExp(b, &b_shift);
  // Detect NaNs and infinities.
  if (a_shift == std::numeric_limits<int>::max() ||
      (b_shift == std::numeric_limits<int>::max())) {
    return std::numeric_limits<double>::quiet_NaN();
  }
  const int result_shift = a_shift + b_shift + 1;
  const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
  return DoubleFromFractionAndShift(result_fraction, result_shift);
}

int IntegerDoubleCompare(double a, double b) {
  int a_shift;
  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
  int b_shift;
  const int64_t b_fraction = IntegerFrExp(b, &b_shift);

  // Detect NaNs and infinities.
  if (a_shift == std::numeric_limits<int>::max() ||
      (b_shift == std::numeric_limits<int>::max())) {
    return 1;
  }

  if ((a_fraction == 0) && (b_fraction < 0)) {
    return 1;
  } else if ((a_fraction < 0) && (b_fraction == 0)) {
    return -1;
  } else if (a_shift < b_shift) {
    return -1;
  } else if (a_shift > b_shift) {
    return 1;
  } else if (a_fraction < b_fraction) {
    return -1;
  } else if (a_fraction > b_fraction) {
    return 1;
  } else {
    return 0;
  }
}

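// Illustrative example for PreprocessSoftmaxScaling() below (values chosen
// only for exposition): with beta = 1.0, input_scale = 1.0 / 256 and
// input_integer_bits = 5, the real multiplier is (1 / 256) * 2^(31 - 5) =
// 2^18, well under the cap, so QuantizeMultiplierGreaterThanOne() produces
// quantized_multiplier = 1 << 30 with left_shift = 19.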
void PreprocessSoftmaxScaling(double beta, double input_scale,
                              int input_integer_bits,
                              int32_t* quantized_multiplier, int* left_shift) {
  // If the overall multiplier (input and beta) is large, then exp() of an
  // input difference of 1 scaled by this will be large. In other words, we
  // can cap the multiplier and know that, when it is used, the output will be
  // (round to) zero wherever the input is not at the maximum value.

  // If the overall scale is less than one, and input_integer_bits=0, then the
  // result is double equivalent of Q0.31 (actually with more precision). Thus
  // this generates a Q(input_integer_bits).(31-input_integer_bits)
  // representation.
#if TFLITE_SINGLE_ROUNDING
  const double max_real_multiplier = (1LL << 30) - 1.0;
#else
  const double max_real_multiplier = (1LL << 31) - 1.0;
#endif

#ifdef TFLITE_EMULATE_FLOAT
  const double input_beta = IntegerDoubleMultiply(beta, input_scale);
  int shift;
  int64_t fraction = IntegerFrExp(input_beta, &shift);
  shift += (31 - input_integer_bits);
  double input_beta_real_multiplier =
      DoubleFromFractionAndShift(fraction, shift);
  if (IntegerDoubleCompare(input_beta_real_multiplier, max_real_multiplier) >
      0) {
    input_beta_real_multiplier = max_real_multiplier;
  }
#else   // TFLITE_EMULATE_FLOAT
  const double input_beta_real_multiplier =
      std::min<double>(beta * input_scale * (1 << (31 - input_integer_bits)),
                       max_real_multiplier);
#endif  // TFLITE_EMULATE_FLOAT

  QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
                                   quantized_multiplier, left_shift);
}

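// Illustrative continuation for PreprocessLogSoftmaxScalingExp() below: with
// quantized_multiplier = 1 << 30 and left_shift = 19 (e.g. from the softmax
// example above), the reverse scaling divisor is 2^(31 - 19) / 2^30 = 2^-18,
// which QuantizeMultiplierSmallerThanOneExp() encodes as a significand of
// 1 << 30 with a left shift of -17.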
void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
                                    int input_integer_bits,
                                    int32_t* quantized_multiplier,
                                    int* left_shift,
                                    int32_t* reverse_scaling_divisor,
                                    int* reverse_scaling_left_shift) {
  PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
                           quantized_multiplier, left_shift);

  // Also calculate what amounts to the inverse scaling factor for the input.
  const double real_reverse_scaling_divisor =
      (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
  tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
                                              reverse_scaling_divisor,
                                              reverse_scaling_left_shift);
}

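// Worked example for CalculateInputRadius() below (illustrative only): with
// input_integer_bits = 4, input_left_shift = 20 and total_signed_bits = 31,
// the maximum rescaled input is 15 * 2^27 / 2^20 = 1920, so the returned
// radius is 1920 on both the float and the integer-emulation paths.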
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
                         int total_signed_bits) {
#ifdef TFLITE_EMULATE_FLOAT
  int64_t result = (1 << input_integer_bits) - 1;
  result <<= (total_signed_bits - input_integer_bits);
  result >>= input_left_shift;
  return result;
#else   // TFLITE_EMULATE_FLOAT
  const double max_input_rescaled =
      1.0 * ((1 << input_integer_bits) - 1) *
      (1LL << (total_signed_bits - input_integer_bits)) /
      (1LL << input_left_shift);
  // Tighten bound using floor. Suppose that we could use the exact value.
  // After scaling the difference, the result would be at the maximum. Thus we
  // must ensure that our value has lower magnitude.
  return static_cast<int>(std::floor(max_input_rescaled));
#endif  // TFLITE_EMULATE_FLOAT
}

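// Illustrative example for NudgeQuantizationRange() below: for min = -1.0f,
// max = 3.0f, quant_min = 0 and quant_max = 255, the scale is 4/255 and the
// zero point computed from min is 63.75, which rounds to 64. The nudged range
// becomes roughly [-1.0039, 2.9961], so the quantized value 64 dequantizes
// exactly to 0.0f.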
void NudgeQuantizationRange(const float min, const float max,
                            const int quant_min, const int quant_max,
                            float* nudged_min, float* nudged_max,
                            float* nudged_scale) {
  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
  const float quant_min_float = static_cast<float>(quant_min);
  const float quant_max_float = static_cast<float>(quant_max);
  *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
  const float zero_point_from_min = quant_min_float - min / *nudged_scale;
  uint16_t nudged_zero_point;
  if (zero_point_from_min < quant_min_float) {
    nudged_zero_point = static_cast<uint16_t>(quant_min);
  } else if (zero_point_from_min > quant_max_float) {
    nudged_zero_point = static_cast<uint16_t>(quant_max);
  } else {
    nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
  }
  *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
  *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
}

void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
                       const float nudged_max, const float* input_data,
                       float* output_data, const float size) {
  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
  const float inv_nudged_scale = 1.0f / nudged_scale;

  for (int i = 0; i < size; i++) {
    const float src_val = input_data[i];
    const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
    const float clamped_shifted = clamped - nudged_min;
    const float dst_val =
        TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
        nudged_min;
    output_data[i] = dst_val;
  }
}

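// Illustrative example for CheckedLog2() below:
//
//   int result;
//   CheckedLog2(8.0f, &result);  // Returns true, result == 3.
//   CheckedLog2(3.0f, &result);  // Returns false: log2(3) is ~1.585, whose
//                                // fractional part exceeds the 1e-3 tolerance.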
bool CheckedLog2(const float x, int* log2_result) {
  // Using TfLiteRound instead of std::round and std::log instead of
  // std::log2 to work around these functions being missing in a toolchain
  // used in some TensorFlow tests as of May 2018.
  const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
  const float x_log2_rounded = TfLiteRound(x_log2);
  const float x_log2_fracpart = x_log2 - x_log2_rounded;

  *log2_result = static_cast<int>(x_log2_rounded);
  return std::abs(x_log2_fracpart) < 1e-3f;
}

void QuantizeMultiplierArray(const double* effective_scales, size_t size,
                             int32_t* effective_scale_significand,
                             int* effective_shift) {
  for (size_t i = 0; i < size; ++i) {
    QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
                       &effective_shift[i]);
  }
}

}  // namespace tflite