1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
17
18 #include <algorithm>
19
20 #include "tensorflow/lite/kernels/internal/common.h"
21
22 namespace tflite {
23 namespace reference_integer_ops {
DepthwiseConvPerChannel(const DepthwiseParams & params,const int32_t * output_multiplier,const int32_t * output_shift,const RuntimeShape & input_shape,const int8_t * input_data,const RuntimeShape & filter_shape,const int8_t * filter_data,const RuntimeShape & bias_shape,const int32_t * bias_data,const RuntimeShape & output_shape,int8_t * output_data)24 inline void DepthwiseConvPerChannel(
25 const DepthwiseParams& params, const int32_t* output_multiplier,
26 const int32_t* output_shift, const RuntimeShape& input_shape,
27 const int8_t* input_data, const RuntimeShape& filter_shape,
28 const int8_t* filter_data, const RuntimeShape& bias_shape,
29 const int32_t* bias_data, const RuntimeShape& output_shape,
30 int8_t* output_data) {
31 // Get parameters.
32 // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
33 const int stride_width = params.stride_width;
34 const int stride_height = params.stride_height;
35 const int dilation_width_factor = params.dilation_width_factor;
36 const int dilation_height_factor = params.dilation_height_factor;
37 const int pad_width = params.padding_values.width;
38 const int pad_height = params.padding_values.height;
39 const int depth_multiplier = params.depth_multiplier;
40 const int32_t input_offset = params.input_offset;
41 const int32_t output_offset = params.output_offset;
42 const int32_t output_activation_min = params.quantized_activation_min;
43 const int32_t output_activation_max = params.quantized_activation_max;
44
45 // Check dimensions of the tensors.
46 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
47 TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
48 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
49
50 TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
51 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
52 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
53 const int input_height = input_shape.Dims(1);
54 const int input_width = input_shape.Dims(2);
55 const int input_depth = input_shape.Dims(3);
56 const int filter_height = filter_shape.Dims(1);
57 const int filter_width = filter_shape.Dims(2);
58 const int output_height = output_shape.Dims(1);
59 const int output_width = output_shape.Dims(2);
60 TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
61 TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
62
63 for (int batch = 0; batch < batches; ++batch) {
64 for (int out_y = 0; out_y < output_height; ++out_y) {
65 for (int out_x = 0; out_x < output_width; ++out_x) {
66 for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
67 for (int m = 0; m < depth_multiplier; ++m) {
68 const int output_channel = m + in_channel * depth_multiplier;
69 const int in_x_origin = (out_x * stride_width) - pad_width;
70 const int in_y_origin = (out_y * stride_height) - pad_height;
71 int32_t acc = 0;
72 for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
73 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
74 const int in_x = in_x_origin + dilation_width_factor * filter_x;
75 const int in_y =
76 in_y_origin + dilation_height_factor * filter_y;
77 // Zero padding by omitting the areas outside the image.
78 const bool is_point_inside_image =
79 (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
80 (in_y < input_height);
81 if (is_point_inside_image) {
82 int32_t input_val = input_data[Offset(
83 input_shape, batch, in_y, in_x, in_channel)];
84 int32_t filter_val = filter_data[Offset(
85 filter_shape, 0, filter_y, filter_x, output_channel)];
86 // Accumulate with 32 bits accumulator.
87 // In the nudging process during model quantization, we force
88 // real value of 0.0 be represented by a quantized value. This
89 // guarantees that the input_offset is a int8_t, even though
90 // it is represented using int32_t. int32_t += int8_t *
91 // (int8_t - int8_t) so the highest value we can get from each
92 // accumulation is [-127, 127] * ([-128, 127] -
93 // [-128, 127]), which is [-32512, 32512]. log2(32512)
94 // = 14.98, which means we can accumulate at least 2^16
95 // multiplications without overflow. The accumulator is
96 // applied to a filter so the accumulation logic will hold as
97 // long as the filter size (filter_y * filter_x * in_channel)
98 // does not exceed 2^16, which is the case in all the models
99 // we have seen so far.
100 // TODO(b/174275578): Add a check to make sure the
101 // accumulator depth is smaller than 2^16.
102 acc += filter_val * (input_val + input_offset);
103 }
104 }
105 }
106 if (bias_data) {
107 acc += bias_data[output_channel];
108 }
109 acc = MultiplyByQuantizedMultiplier(
110 acc, output_multiplier[output_channel],
111 output_shift[output_channel]);
112 acc += output_offset;
113 acc = std::max(acc, output_activation_min);
114 acc = std::min(acc, output_activation_max);
115 output_data[Offset(output_shape, batch, out_y, out_x,
116 output_channel)] = static_cast<int8_t>(acc);
117 }
118 }
119 }
120 }
121 }
122 }
123
DepthwiseConvPerChannel(const DepthwiseParams & params,const int32_t * output_multiplier,const int32_t * output_shift,const RuntimeShape & input_shape,const int16_t * input_data,const RuntimeShape & filter_shape,const int8_t * filter_data,const RuntimeShape & bias_shape,const std::int64_t * bias_data,const RuntimeShape & output_shape,int16_t * output_data)124 inline void DepthwiseConvPerChannel(
125 const DepthwiseParams& params, const int32_t* output_multiplier,
126 const int32_t* output_shift, const RuntimeShape& input_shape,
127 const int16_t* input_data, const RuntimeShape& filter_shape,
128 const int8_t* filter_data, const RuntimeShape& bias_shape,
129 const std::int64_t* bias_data, const RuntimeShape& output_shape,
130 int16_t* output_data) {
131 // Get parameters.
132 const int stride_width = params.stride_width;
133 const int stride_height = params.stride_height;
134 const int dilation_width_factor = params.dilation_width_factor;
135 const int dilation_height_factor = params.dilation_height_factor;
136 const int pad_width = params.padding_values.width;
137 const int pad_height = params.padding_values.height;
138 const int depth_multiplier = params.depth_multiplier;
139 const int32_t output_activation_min = params.quantized_activation_min;
140 const int32_t output_activation_max = params.quantized_activation_max;
141
142 // Check dimensions of the tensors.
143 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
144 TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
145 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
146
147 TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
148 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
149 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
150 const int input_height = input_shape.Dims(1);
151 const int input_width = input_shape.Dims(2);
152 const int input_depth = input_shape.Dims(3);
153 const int filter_height = filter_shape.Dims(1);
154 const int filter_width = filter_shape.Dims(2);
155 const int output_height = output_shape.Dims(1);
156 const int output_width = output_shape.Dims(2);
157 TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
158 TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
159
160 for (int batch = 0; batch < batches; ++batch) {
161 for (int out_y = 0; out_y < output_height; ++out_y) {
162 for (int out_x = 0; out_x < output_width; ++out_x) {
163 for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
164 for (int m = 0; m < depth_multiplier; ++m) {
165 const int output_channel = m + in_channel * depth_multiplier;
166 const int in_x_origin = (out_x * stride_width) - pad_width;
167 const int in_y_origin = (out_y * stride_height) - pad_height;
168 std::int64_t acc = 0;
169 for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
170 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
171 const int in_x = in_x_origin + dilation_width_factor * filter_x;
172 const int in_y =
173 in_y_origin + dilation_height_factor * filter_y;
174 // Zero padding by omitting the areas outside the image.
175 const bool is_point_inside_image =
176 (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
177 (in_y < input_height);
178 if (is_point_inside_image) {
179 int32_t input_val = input_data[Offset(
180 input_shape, batch, in_y, in_x, in_channel)];
181 int32_t filter_val = filter_data[Offset(
182 filter_shape, 0, filter_y, filter_x, output_channel)];
183 // Accumulate with 64 bits accumulator.
184 // We assume maximum of 2^16 accumulations as with the 8-bit
185 // case so actually the value in the accumulator should not
186 // exceed 40 bits
187 acc += static_cast<int64_t>(filter_val) *
188 static_cast<int64_t>(input_val);
189 }
190 }
191 }
192 if (bias_data) {
193 acc += bias_data[output_channel];
194 }
195 int32_t scaled_acc = MultiplyByQuantizedMultiplier(
196 acc, output_multiplier[output_channel],
197 output_shift[output_channel]);
198 scaled_acc = std::max(scaled_acc, output_activation_min);
199 scaled_acc = std::min(scaled_acc, output_activation_max);
200 output_data[Offset(output_shape, batch, out_y, out_x,
201 output_channel)] =
202 static_cast<int16_t>(scaled_acc);
203 }
204 }
205 }
206 }
207 }
208 }
209
DepthwiseConvHybridPerChannel(const DepthwiseParams & params,float * scaling_factors_ptr,const RuntimeShape & input_shape,const int8_t * input_data,const RuntimeShape & filter_shape,const int8_t * filter_data,const RuntimeShape & bias_shape,const float * bias_data,const RuntimeShape & output_shape,float * output_data,const float * per_channel_scale,int32_t * input_offset)210 inline void DepthwiseConvHybridPerChannel(
211 const DepthwiseParams& params, float* scaling_factors_ptr,
212 const RuntimeShape& input_shape, const int8_t* input_data,
213 const RuntimeShape& filter_shape, const int8_t* filter_data,
214 const RuntimeShape& bias_shape, const float* bias_data,
215 const RuntimeShape& output_shape, float* output_data,
216 const float* per_channel_scale, int32_t* input_offset) {
217 const int stride_width = params.stride_width;
218 const int stride_height = params.stride_height;
219 const int dilation_width_factor = params.dilation_width_factor;
220 const int dilation_height_factor = params.dilation_height_factor;
221 const int pad_width = params.padding_values.width;
222 const int pad_height = params.padding_values.height;
223 const int depth_multiplier = params.depth_multiplier;
224 const float output_activation_min = params.float_activation_min;
225 const float output_activation_max = params.float_activation_max;
226 // Check dimensions of the tensors.
227 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
228 TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
229 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
230
231 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
232 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
233 const int input_height = input_shape.Dims(1);
234 const int input_width = input_shape.Dims(2);
235 const int input_depth = input_shape.Dims(3);
236 const int filter_height = filter_shape.Dims(1);
237 const int filter_width = filter_shape.Dims(2);
238 const int output_height = output_shape.Dims(1);
239 const int output_width = output_shape.Dims(2);
240 const int bias_depth = bias_shape.FlatSize();
241 TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
242 TFLITE_DCHECK_EQ(bias_depth, output_depth);
243
244 for (int batch = 0; batch < batches; ++batch) {
245 for (int out_y = 0; out_y < output_height; ++out_y) {
246 for (int out_x = 0; out_x < output_width; ++out_x) {
247 for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
248 for (int m = 0; m < depth_multiplier; ++m) {
249 const int output_channel = m + in_channel * depth_multiplier;
250 const int in_x_origin = (out_x * stride_width) - pad_width;
251 const int in_y_origin = (out_y * stride_height) - pad_height;
252 int32_t acc = 0;
253 for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
254 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
255 const int in_x = in_x_origin + dilation_width_factor * filter_x;
256 const int in_y =
257 in_y_origin + dilation_height_factor * filter_y;
258 // Zero padding by omitting the areas outside the image.
259 const bool is_point_inside_image =
260 (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
261 (in_y < input_height);
262 if (is_point_inside_image) {
263 int32_t input_val = input_data[Offset(
264 input_shape, batch, in_y, in_x, in_channel)];
265 int32_t filter_val = filter_data[Offset(
266 filter_shape, 0, filter_y, filter_x, output_channel)];
267 acc += filter_val * (input_val - input_offset[batch]);
268 }
269 }
270 }
271 float acc_float = static_cast<float>(acc);
272 acc_float *=
273 per_channel_scale[output_channel] * scaling_factors_ptr[batch];
274 if (bias_data && output_channel < bias_depth) {
275 acc_float += bias_data[output_channel];
276 }
277 output_data[Offset(output_shape, batch, out_y, out_x,
278 output_channel)] =
279 ActivationFunctionWithMinMax(acc_float, output_activation_min,
280 output_activation_max);
281 }
282 }
283 }
284 }
285 }
286 }
287
288 } // namespace reference_integer_ops
289 } // namespace tflite
290
291 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
292