1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/compiler/mlir/lite/quantization/numerical_utils.h"
16
17 #include <assert.h>
18
19 #include <algorithm>
20 #include <cmath>
21 #include <limits>
22 #include <optional>
23
24 #include "absl/types/optional.h"
25
26 namespace mlir {
27 namespace quant {
28
29 // This method is adopted from TFLite:
30 // ["tensorflow/lite/kernels/internal/quantization_util.cc"]
QuantizeMultiplier(double double_multiplier)31 QuantizedMultiplier QuantizeMultiplier(double double_multiplier) {
32 if (double_multiplier < 1e-6) {
33 return {0, 0};
34 }
35
36 int32_t shift;
37 const double q = frexp(double_multiplier, &shift);
38 auto q_fixed = static_cast<int64_t>(round(q * (1LL << 31)));
39 assert(q_fixed <= (1LL << 31));
40 if (q_fixed == (1LL << 31)) {
41 q_fixed /= 2;
42 ++shift;
43 }
44 assert(q_fixed <= std::numeric_limits<int32_t>::max());
45 // A shift amount smaller than -31 would cause all bits to be shifted out
46 // and thus all results would be zero. We implement that instead with
47 // q_fixed==0, so as to avoid hitting issues with right-shift
48 // operations with shift amounts greater than 31. Note that this happens
49 // roughly when abs(double_multiplier) < 2^-31 and the present handling means
50 // that we're effectively flushing tiny double_multiplier's to zero.
51 // We could conceivably handle values in the range (roughly) [32, 63]
52 // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
53 // the present handling is just doing 'flush denormals to zero'. We could
54 // reconsider and actually generate nonzero denormals if a need arises.
55 if (shift < -31) {
56 shift = 0;
57 q_fixed = 0;
58 }
59 return {static_cast<int32_t>(q_fixed), shift};
60 }
61
CalculateQuantizedRange(double scale,int32_t zero_point,std::optional<double> rmin,std::optional<double> rmax,int32_t qmin,int32_t qmax)62 QuantizedRange CalculateQuantizedRange(double scale, int32_t zero_point,
63 std::optional<double> rmin,
64 std::optional<double> rmax, int32_t qmin,
65 int32_t qmax) {
66 auto quantize = [scale, zero_point](float f) {
67 return zero_point + static_cast<int32_t>(std::round(f / scale));
68 };
69
70 if (rmin.has_value() && rmax.has_value()) {
71 return {std::max(qmin, quantize(rmin.value())),
72 std::min(qmax, quantize(rmax.value()))};
73 } else if (rmin.has_value()) {
74 return {std::max(qmin, quantize(rmin.value())), qmax};
75 } else if (rmax.has_value()) {
76 return {qmin, std::min(qmax, quantize(rmax.value()))};
77 } else {
78 return {qmin, qmax};
79 }
80 }
81
82 } // namespace quant
83 } // namespace mlir
84