xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
17 
18 #include <algorithm>
19 
20 #include "tensorflow/lite/kernels/internal/common.h"
21 
22 namespace tflite {
23 namespace reference_integer_ops {
DepthwiseConvPerChannel(const DepthwiseParams & params,const int32_t * output_multiplier,const int32_t * output_shift,const RuntimeShape & input_shape,const int8_t * input_data,const RuntimeShape & filter_shape,const int8_t * filter_data,const RuntimeShape & bias_shape,const int32_t * bias_data,const RuntimeShape & output_shape,int8_t * output_data)24 inline void DepthwiseConvPerChannel(
25     const DepthwiseParams& params, const int32_t* output_multiplier,
26     const int32_t* output_shift, const RuntimeShape& input_shape,
27     const int8_t* input_data, const RuntimeShape& filter_shape,
28     const int8_t* filter_data, const RuntimeShape& bias_shape,
29     const int32_t* bias_data, const RuntimeShape& output_shape,
30     int8_t* output_data) {
31   // Get parameters.
32   // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
33   const int stride_width = params.stride_width;
34   const int stride_height = params.stride_height;
35   const int dilation_width_factor = params.dilation_width_factor;
36   const int dilation_height_factor = params.dilation_height_factor;
37   const int pad_width = params.padding_values.width;
38   const int pad_height = params.padding_values.height;
39   const int depth_multiplier = params.depth_multiplier;
40   const int32_t input_offset = params.input_offset;
41   const int32_t output_offset = params.output_offset;
42   const int32_t output_activation_min = params.quantized_activation_min;
43   const int32_t output_activation_max = params.quantized_activation_max;
44 
45   // Check dimensions of the tensors.
46   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
47   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
48   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
49 
50   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
51   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
52   const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
53   const int input_height = input_shape.Dims(1);
54   const int input_width = input_shape.Dims(2);
55   const int input_depth = input_shape.Dims(3);
56   const int filter_height = filter_shape.Dims(1);
57   const int filter_width = filter_shape.Dims(2);
58   const int output_height = output_shape.Dims(1);
59   const int output_width = output_shape.Dims(2);
60   TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
61   TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
62 
63   for (int batch = 0; batch < batches; ++batch) {
64     for (int out_y = 0; out_y < output_height; ++out_y) {
65       for (int out_x = 0; out_x < output_width; ++out_x) {
66         for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
67           for (int m = 0; m < depth_multiplier; ++m) {
68             const int output_channel = m + in_channel * depth_multiplier;
69             const int in_x_origin = (out_x * stride_width) - pad_width;
70             const int in_y_origin = (out_y * stride_height) - pad_height;
71             int32_t acc = 0;
72             for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
73               for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
74                 const int in_x = in_x_origin + dilation_width_factor * filter_x;
75                 const int in_y =
76                     in_y_origin + dilation_height_factor * filter_y;
77                 // Zero padding by omitting the areas outside the image.
78                 const bool is_point_inside_image =
79                     (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
80                     (in_y < input_height);
81                 if (is_point_inside_image) {
82                   int32_t input_val = input_data[Offset(
83                       input_shape, batch, in_y, in_x, in_channel)];
84                   int32_t filter_val = filter_data[Offset(
85                       filter_shape, 0, filter_y, filter_x, output_channel)];
86                   // Accumulate with 32 bits accumulator.
87                   // In the nudging process during model quantization, we force
88                   // real value of 0.0 be represented by a quantized value. This
89                   // guarantees that the input_offset is a int8_t, even though
90                   // it is represented using int32_t. int32_t += int8_t *
91                   // (int8_t - int8_t) so the highest value we can get from each
92                   // accumulation is [-127, 127] * ([-128, 127] -
93                   // [-128, 127]), which is [-32512, 32512]. log2(32512)
94                   // = 14.98, which means we can accumulate at least 2^16
95                   // multiplications without overflow. The accumulator is
96                   // applied to a filter so the accumulation logic will hold as
97                   // long as the filter size (filter_y * filter_x * in_channel)
98                   // does not exceed 2^16, which is the case in all the models
99                   // we have seen so far.
100                   // TODO(b/174275578): Add a check to make sure the
101                   // accumulator depth is smaller than 2^16.
102                   acc += filter_val * (input_val + input_offset);
103                 }
104               }
105             }
106             if (bias_data) {
107               acc += bias_data[output_channel];
108             }
109             acc = MultiplyByQuantizedMultiplier(
110                 acc, output_multiplier[output_channel],
111                 output_shift[output_channel]);
112             acc += output_offset;
113             acc = std::max(acc, output_activation_min);
114             acc = std::min(acc, output_activation_max);
115             output_data[Offset(output_shape, batch, out_y, out_x,
116                                output_channel)] = static_cast<int8_t>(acc);
117           }
118         }
119       }
120     }
121   }
122 }
123 
DepthwiseConvPerChannel(const DepthwiseParams & params,const int32_t * output_multiplier,const int32_t * output_shift,const RuntimeShape & input_shape,const int16_t * input_data,const RuntimeShape & filter_shape,const int8_t * filter_data,const RuntimeShape & bias_shape,const std::int64_t * bias_data,const RuntimeShape & output_shape,int16_t * output_data)124 inline void DepthwiseConvPerChannel(
125     const DepthwiseParams& params, const int32_t* output_multiplier,
126     const int32_t* output_shift, const RuntimeShape& input_shape,
127     const int16_t* input_data, const RuntimeShape& filter_shape,
128     const int8_t* filter_data, const RuntimeShape& bias_shape,
129     const std::int64_t* bias_data, const RuntimeShape& output_shape,
130     int16_t* output_data) {
131   // Get parameters.
132   const int stride_width = params.stride_width;
133   const int stride_height = params.stride_height;
134   const int dilation_width_factor = params.dilation_width_factor;
135   const int dilation_height_factor = params.dilation_height_factor;
136   const int pad_width = params.padding_values.width;
137   const int pad_height = params.padding_values.height;
138   const int depth_multiplier = params.depth_multiplier;
139   const int32_t output_activation_min = params.quantized_activation_min;
140   const int32_t output_activation_max = params.quantized_activation_max;
141 
142   // Check dimensions of the tensors.
143   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
144   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
145   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
146 
147   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
148   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
149   const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
150   const int input_height = input_shape.Dims(1);
151   const int input_width = input_shape.Dims(2);
152   const int input_depth = input_shape.Dims(3);
153   const int filter_height = filter_shape.Dims(1);
154   const int filter_width = filter_shape.Dims(2);
155   const int output_height = output_shape.Dims(1);
156   const int output_width = output_shape.Dims(2);
157   TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
158   TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
159 
160   for (int batch = 0; batch < batches; ++batch) {
161     for (int out_y = 0; out_y < output_height; ++out_y) {
162       for (int out_x = 0; out_x < output_width; ++out_x) {
163         for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
164           for (int m = 0; m < depth_multiplier; ++m) {
165             const int output_channel = m + in_channel * depth_multiplier;
166             const int in_x_origin = (out_x * stride_width) - pad_width;
167             const int in_y_origin = (out_y * stride_height) - pad_height;
168             std::int64_t acc = 0;
169             for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
170               for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
171                 const int in_x = in_x_origin + dilation_width_factor * filter_x;
172                 const int in_y =
173                     in_y_origin + dilation_height_factor * filter_y;
174                 // Zero padding by omitting the areas outside the image.
175                 const bool is_point_inside_image =
176                     (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
177                     (in_y < input_height);
178                 if (is_point_inside_image) {
179                   int32_t input_val = input_data[Offset(
180                       input_shape, batch, in_y, in_x, in_channel)];
181                   int32_t filter_val = filter_data[Offset(
182                       filter_shape, 0, filter_y, filter_x, output_channel)];
183                   // Accumulate with 64 bits accumulator.
184                   // We assume maximum of 2^16 accumulations as with the 8-bit
185                   // case so actually the value in the accumulator should not
186                   // exceed 40 bits
187                   acc += static_cast<int64_t>(filter_val) *
188                          static_cast<int64_t>(input_val);
189                 }
190               }
191             }
192             if (bias_data) {
193               acc += bias_data[output_channel];
194             }
195             int32_t scaled_acc = MultiplyByQuantizedMultiplier(
196                 acc, output_multiplier[output_channel],
197                 output_shift[output_channel]);
198             scaled_acc = std::max(scaled_acc, output_activation_min);
199             scaled_acc = std::min(scaled_acc, output_activation_max);
200             output_data[Offset(output_shape, batch, out_y, out_x,
201                                output_channel)] =
202                 static_cast<int16_t>(scaled_acc);
203           }
204         }
205       }
206     }
207   }
208 }
209 
DepthwiseConvHybridPerChannel(const DepthwiseParams & params,float * scaling_factors_ptr,const RuntimeShape & input_shape,const int8_t * input_data,const RuntimeShape & filter_shape,const int8_t * filter_data,const RuntimeShape & bias_shape,const float * bias_data,const RuntimeShape & output_shape,float * output_data,const float * per_channel_scale,int32_t * input_offset)210 inline void DepthwiseConvHybridPerChannel(
211     const DepthwiseParams& params, float* scaling_factors_ptr,
212     const RuntimeShape& input_shape, const int8_t* input_data,
213     const RuntimeShape& filter_shape, const int8_t* filter_data,
214     const RuntimeShape& bias_shape, const float* bias_data,
215     const RuntimeShape& output_shape, float* output_data,
216     const float* per_channel_scale, int32_t* input_offset) {
217   const int stride_width = params.stride_width;
218   const int stride_height = params.stride_height;
219   const int dilation_width_factor = params.dilation_width_factor;
220   const int dilation_height_factor = params.dilation_height_factor;
221   const int pad_width = params.padding_values.width;
222   const int pad_height = params.padding_values.height;
223   const int depth_multiplier = params.depth_multiplier;
224   const float output_activation_min = params.float_activation_min;
225   const float output_activation_max = params.float_activation_max;
226   // Check dimensions of the tensors.
227   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
228   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
229   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
230 
231   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
232   const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
233   const int input_height = input_shape.Dims(1);
234   const int input_width = input_shape.Dims(2);
235   const int input_depth = input_shape.Dims(3);
236   const int filter_height = filter_shape.Dims(1);
237   const int filter_width = filter_shape.Dims(2);
238   const int output_height = output_shape.Dims(1);
239   const int output_width = output_shape.Dims(2);
240   const int bias_depth = bias_shape.FlatSize();
241   TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
242   TFLITE_DCHECK_EQ(bias_depth, output_depth);
243 
244   for (int batch = 0; batch < batches; ++batch) {
245     for (int out_y = 0; out_y < output_height; ++out_y) {
246       for (int out_x = 0; out_x < output_width; ++out_x) {
247         for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
248           for (int m = 0; m < depth_multiplier; ++m) {
249             const int output_channel = m + in_channel * depth_multiplier;
250             const int in_x_origin = (out_x * stride_width) - pad_width;
251             const int in_y_origin = (out_y * stride_height) - pad_height;
252             int32_t acc = 0;
253             for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
254               for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
255                 const int in_x = in_x_origin + dilation_width_factor * filter_x;
256                 const int in_y =
257                     in_y_origin + dilation_height_factor * filter_y;
258                 // Zero padding by omitting the areas outside the image.
259                 const bool is_point_inside_image =
260                     (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
261                     (in_y < input_height);
262                 if (is_point_inside_image) {
263                   int32_t input_val = input_data[Offset(
264                       input_shape, batch, in_y, in_x, in_channel)];
265                   int32_t filter_val = filter_data[Offset(
266                       filter_shape, 0, filter_y, filter_x, output_channel)];
267                   acc += filter_val * (input_val - input_offset[batch]);
268                 }
269               }
270             }
271             float acc_float = static_cast<float>(acc);
272             acc_float *=
273                 per_channel_scale[output_channel] * scaling_factors_ptr[batch];
274             if (bias_data && output_channel < bias_depth) {
275               acc_float += bias_data[output_channel];
276             }
277             output_data[Offset(output_shape, batch, out_y, out_x,
278                                output_channel)] =
279                 ActivationFunctionWithMinMax(acc_float, output_activation_min,
280                                              output_activation_max);
281           }
282         }
283       }
284     }
285   }
286 }
287 
288 }  // namespace reference_integer_ops
289 }  // namespace tflite
290 
291 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
292