/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This header file defines node specs for quantization and the methods to
// parse command-line flags into these specs.

#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONFIG_H_
#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONFIG_H_

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/strings/string_view.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/lite/tools/optimize/reduced_precision_support.h"

namespace mlir {
namespace quant {

// Stores information about how to quantize a user-specified custom operation.
struct CustomOpInfo {
  std::vector<std::int32_t> quantizable_input_indices;
  bool is_weight_only = false;
  bool no_side_effect = true;
};

using ::tflite::optimize::ReducedPrecisionSupport;
using StringSet = absl::flat_hash_set<std::string>;
using CustomOpMap = std::unordered_map<std::string, CustomOpInfo>;
enum CustomOpUpdateOptions { kINputIndices, kWeightOnly, kNoSideEffect };
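
// A minimal sketch of populating a CustomOpMap entry; the op name "MyCustomOp"
// and the chosen indices are hypothetical, for illustration only:
//
//   CustomOpMap custom_op_map;
//   CustomOpInfo info;
//   info.quantizable_input_indices = {0, 2};  // quantize inputs 0 and 2
//   info.is_weight_only = true;               // dequantize back at runtime
//   custom_op_map["MyCustomOp"] = info;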

struct QuantizationSpecs {
  // Which function these node quant specifications belong to.
  std::string target_func = "main";

  // Whether the quantization passes are triggered for post-training
  // quantization. If it is true, the model input doesn't require
  // user-specified input ranges.
  bool post_training_quantization = false;

  // Whether to allow dynamic range quantization. This is the easiest
  // quantization mode; it doesn't require QAT or sample inputs, but it can
  // only target the DT_HALF and DT_QINT8 inference types.
  bool weight_quantization = false;

  // Whether to use the MLIR dynamic range quantizer instead of the old TOCO
  // one.
  bool enable_mlir_dynamic_range_quantizer = false;

  // Whether to allow weight-only quantization. This scheme quantizes weights
  // but dequantizes them back at runtime, which is useful to save memory when
  // the kernel support is not yet available in lower precisions. Used in the
  // MLIR dynamic range quantizer.
  bool weight_only_quantization = false;

  // The minimum number of elements in a weights array required to apply
  // quantization. This is especially useful for not quantizing small tensors,
  // as it is hard to get performance benefits from them with quantization.
  // Used in the MLIR dynamic range quantizer with the int8 weight data type.
  int64_t minimum_elements_for_weights = 1024;

  // Calculate scales in float to keep quantized values the same as with the
  // old TOCO quantizer.
  bool legacy_float_scale = false;

  // When set to true, quantization will be done per-tensor. Currently, this
  // option is only valid when the quantization parameters need to be created
  // by scanning the constant content (post-training quantization or QAT
  // without weight FakeQuant).
  bool disable_per_channel = false;

  // When set to true, the fixed output ranges of the activation ops (tanh,
  // sigmoid, etc.) and the weight constants are not inferred. Then, to
  // quantize these ops, quantization emulation ops should be placed after the
  // ops in the input graph. This flag should be set to false for post-training
  // quantization.
  bool disable_infer_tensor_range = false;

  // The node type when the model is exported. Currently this is limited to
  // DT_FLOAT, DT_HALF, DT_QINT8, and DT_QUINT8. When DT_HALF is used, the
  // `weight_quantization` flag needs to be set to true. When DT_QUINT8 is
  // used, the `weight_quantization` flag needs to be set to false.
  tensorflow::DataType inference_type = tensorflow::DT_FLOAT;

  // The input and output data type during inference. This flag is only used
  // when `inference_type` is different from DT_FLOAT. It can only be set to
  // DT_FLOAT or to the same type as `inference_type`. If this flag is
  // different from `inference_type`, adaptor ops are inserted as leading and
  // trailing ops in the resulting model.
  tensorflow::DataType inference_input_type = tensorflow::DT_FLOAT;

  // Input node ranges. These ranges are stored in the same order as the
  // function arguments. They are only used when `weight_quantization` is set
  // to false, and the model is required to have quantization parameters,
  // either from quantization-aware training or calibration, for the remaining
  // tensors.
  std::vector<std::pair<llvm::Optional<double>, llvm::Optional<double>>>
      input_ranges;

  // Whether to disable setting the quantization parameters of the input nodes
  // using input ranges.
  bool disable_set_input_nodes_quantization_params = false;

  // The default ranges can be used when a tensor doesn't have quantization
  // parameters and couldn't be quantized. Used only for latency tests.
  std::pair<llvm::Optional<double>, llvm::Optional<double>> default_ranges;

  // A serialized "QuantizationInfo" object to specify value ranges for some of
  // the tensors with known names.
  std::string serialized_quant_stats = "";

  // A bitmask to encode support for reduced precision inference in the model.
  ReducedPrecisionSupport support_mask = ReducedPrecisionSupport::None;

  // Whether to run the passes that propagate the quantization parameters and
  // do graph rewrites. Returns false if the inference_type is DT_FLOAT or the
  // `weight_quantization` flag is set.
  bool RunPropagationAndRewriteQuantizationPasses() const {
    return inference_type != tensorflow::DT_FLOAT && !weight_quantization;
  }

  // TODO(b/202075505): make implicit weight type clearer
  // Whether to run the passes and graph rewrites for dynamic range
  // quantization.
  bool RunAndRewriteDynamicRangeQuantizationPasses() const {
    // TODO(b/201389248): add condition that symmetric, signed, int8 only
    // If this fails, a log message will let the user know that nothing
    // happened.
    bool dynamic_range_quantize =
        (inference_type != tensorflow::DT_FLOAT) && weight_quantization &&
        !post_training_quantization && !disable_infer_tensor_range &&
        enable_mlir_dynamic_range_quantizer;
    return dynamic_range_quantize;
  }

  // Whether this inference type represents a signed storage type.
  bool IsSignedInferenceType() const {
    switch (inference_type) {
      case tensorflow::DT_QUINT8:
      case tensorflow::DT_QUINT16:
        return false;
      default:
        return true;
    }
  }

  // Gets the width of this quantization type. Returns 0 if it isn't a
  // quantization type.
  int64_t GetQuantizationTypeWidth() const {
    switch (inference_type) {
      case tensorflow::DT_QINT8:
      case tensorflow::DT_QUINT8:
        return 8;
      case tensorflow::DT_QINT16:
      case tensorflow::DT_QUINT16:
        return 16;
      case tensorflow::DT_QINT32:
        return 32;
      default:
        return 0;
    }
  }
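
  // For example, with inference_type == tensorflow::DT_QINT8,
  // IsSignedInferenceType() returns true and GetQuantizationTypeWidth()
  // returns 8; with DT_QUINT16 they return false and 16.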

  // Whether to add the NumericVerify ops to verify numbers before and after
  // quantization.
  bool verify_numeric = false;
  // Whether to add verification layer by layer or on the whole model. When
  // disabled (per-layer), float and quantized ops are run from the same input
  // (the output of the previous quantized layer). When enabled, float and
  // quantized ops run from the respective float and quantized outputs of the
  // previous ops.
  bool whole_model_verify = false;

  // Whether to use fake quant attributes to calculate quantization parameters.
  bool use_fake_quant_num_bits = false;

  // Names of ops to block from quantization. Used in QuantizePass.
  // For dynamic range quantization, ops in the blocklist are quantized in a
  // weight-only manner.
  StringSet ops_blocklist;

  // Names of locations to block from quantization. Used in QuantizePass.
  StringSet nodes_blocklist;

  // Map from custom op code to custom op quantization information.
  // For dynamic range quantization, only the custom ops in the graph that are
  // specified in this map are subject to quantization.
  CustomOpMap custom_map;
};
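
// A minimal sketch of a QuantizationSpecs configuration that satisfies
// RunAndRewriteDynamicRangeQuantizationPasses(); the field values below are
// illustrative assumptions rather than recommended defaults:
//
//   QuantizationSpecs specs;
//   specs.inference_type = tensorflow::DT_QINT8;
//   specs.weight_quantization = true;
//   specs.enable_mlir_dynamic_range_quantizer = true;
//   // post_training_quantization and disable_infer_tensor_range keep their
//   // default value of false, so the dynamic range passes will run.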

// Parses the command-line flag strings into the CustomOpMap specification.
void ParseCustomOpSpecs(absl::string_view node_names,
                        const CustomOpUpdateOptions& update_option,
                        CustomOpMap& custom_op_map);

// Parses the command-line flag strings into the quantization specification for
// the input arrays of a graph. The array names are not stored in the spec and
// are matched by position. Returns true on failure.
bool ParseInputNodeQuantSpecs(absl::string_view node_names,
                              absl::string_view min_values,
                              absl::string_view max_values,
                              absl::string_view inference_type,
                              QuantizationSpecs* quant_specs);
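
// A hypothetical call sketch, assuming the three value strings hold one
// comma-separated entry per input node (the exact syntax is defined by the
// corresponding implementation):
//
//   QuantizationSpecs specs;
//   bool failed = ParseInputNodeQuantSpecs(
//       /*node_names=*/"input0,input1", /*min_values=*/"-1.0,-1.0",
//       /*max_values=*/"1.0,1.0", /*inference_type=*/"DT_QINT8", &specs);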

// Gets the quantization specification for input arrays. The array names are
// not stored in the spec and are matched by position. The min/max values are
// ignored if the inference_type isn't a quantized type. Returns true on
// failure.
bool GetInputNodeQuantSpecs(
    const std::vector<std::string>& node_names,
    const std::vector<llvm::Optional<double>>& node_mins,
    const std::vector<llvm::Optional<double>>& node_maxs,
    tensorflow::DataType inference_type, QuantizationSpecs* quant_specs);
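
// A minimal call sketch for the vector-based variant; the node name and range
// values are illustrative only:
//
//   QuantizationSpecs specs;
//   bool failed = GetInputNodeQuantSpecs(
//       /*node_names=*/{"input0"},
//       /*node_mins=*/{llvm::Optional<double>(-1.0)},
//       /*node_maxs=*/{llvm::Optional<double>(1.0)},
//       /*inference_type=*/tensorflow::DT_QINT8, &specs);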

}  // namespace quant
}  // namespace mlir

#endif  // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONFIG_H_