/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_

#include <algorithm>
#include <cstring>
#include <vector>

20 #include "ruy/profiler/instrumentation.h" // from @ruy
21 #include "tensorflow/lite/kernels/cpu_backend_context.h"
22 #include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
23 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
24 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
25 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
26 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_hybrid_3x3_filter.h"
27 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
28 #include "tensorflow/lite/kernels/internal/types.h"
29
30 namespace tflite {
31 namespace optimized_integer_ops {
32 namespace depthwise_conv {
33
34 // Initializes the accumulator buffer with zeros.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       int32* acc_buffer) {
  memset(acc_buffer, 0,
         sizeof(acc_buffer[0]) * output_depth * num_output_pixels);
}

// General hybrid depthwise convolution: accumulates int8 inputs against int8
// filters into int32, then dequantizes to float using the per-batch input
// scale and per-channel filter scales, adds the float bias, and clamps to the
// activation range. Operates on the batch/row range assigned to this thread.
inline void DepthwiseConvHybridGeneral(
    const DepthwiseParams& params,
    const float* input_scales, const RuntimeShape& input_shape,
    const int8* input_data, const RuntimeShape& filter_shape,
    const int8* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data, const float* per_channel_scales,
    const int32_t* input_offsets, int thread_start, int thread_end,
    int thread_dim) {
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_rows = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);

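  // Stack-allocated int32 accumulation buffer, sized to hold as many whole
  // output pixels (all channels of one output x/y position) as fit within
  // kAccBufferMaxSize entries.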
  static const int kAccBufferMaxSize = 2048;
  int32 acc_buffer[kAccBufferMaxSize];
  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
                   kAccBufferActualSize);
  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConvHybrid op.
  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

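// This macro picks the first specialized row-accumulation kernel whose
// compile-time constraints (ALLOW_STRIDED, FIXED_INPUT_DEPTH,
// FIXED_DEPTH_MULTIPLIER) match the runtime parameters; a FIXED_INPUT_DEPTH
// of 0 means the kernel accepts any input depth.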
#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
                                        FIXED_DEPTH_MULTIPLIER)            \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&           \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&      \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                        \
    row_accum_func =                                                       \
        QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,   \
                                       FIXED_DEPTH_MULTIPLIER>;            \
  }

#ifdef USE_NEON
  // We go over our list of kernels by decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride!=1.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels allowing a variable input depth;
  // these are the least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
#endif  // USE_NEON

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func) {
    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
  }

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);

  // Now that we have determined row_accum_func, we can start work.
  int batch_start = 0;
  int batch_end = batches;
  int row_start = 0;
  int row_end = output_rows;
  int output_ptr_offset = 0;

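  // The work is partitioned across threads either along the batch dimension
  // (thread_dim == 0) or along output rows (thread_dim == 1).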
  switch (thread_dim) {
    case 0:
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, batches);
      batch_start = thread_start;
      batch_end = thread_end;
      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
      break;
    case 1:
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, output_rows);
      row_start = thread_start;
      row_end = thread_end;
      output_ptr_offset = row_start * output_width * output_depth;
      break;
  }

  float* output_ptr = output_data + output_ptr_offset;
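  // When threading over rows, each batch only writes rows [row_start,
  // row_end), so skip the rows this thread does not own to land on the same
  // row range in the next batch.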
  int batch_step =
      (output_rows + row_start - row_end) * output_width * output_depth;
  for (int b = batch_start; b < batch_end; ++b) {
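    // Hybrid path: each batch of the input carries its own quantization
    // scale and zero point.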
    float input_scale = input_scales[b];
    int32_t input_offset = input_offsets[b];
    for (int out_y = row_start; out_y < row_end; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      const int filter_y_start =
          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
                          dilation_height_factor);
      const int filter_y_end =
          std::min(filter_height,
                   (input_height - in_y_origin + dilation_height_factor - 1) /
                       dilation_height_factor);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth,
                                   acc_buffer);

        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + dilation_height_factor * filter_y;
          row_accum_func(
              stride_width, dilation_width_factor, input_depth, input_width,
              input_data + in_y * input_height_stride + b * input_batch_stride,
              -input_offset, pad_width, depth_multiplier, filter_width,
              filter_data + filter_y * filter_height_stride,
              out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating int32 values. Just store them as float
        // values.
        ruy::profiler::ScopeLabel label("store");
        const int num_output_values = output_depth * num_output_pixels;
        int c = 0;
        while (c < output_depth) {
          int target_output_depth = output_depth;

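// NEON fast path: dequantizes and stores four output channels per iteration;
// any channels left over fall through to the scalar loop below.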
#ifdef USE_NEON
          const float32x4_t output_activation_min_vec =
              vdupq_n_f32(output_activation_min);
          const float32x4_t output_activation_max_vec =
              vdupq_n_f32(output_activation_max);
          const float32x4_t input_scale_32x4 = vdupq_n_f32(input_scale);
          for (; c <= output_depth - 4; c += 4) {
            if ((c + 4) > output_depth) {
              break;
            }
            const float32x4_t channel_scale_32x4 =
                vld1q_f32(per_channel_scales + c);
            const float32x4_t bias_32x4 = vld1q_f32(bias_data + c);
            for (int n = 0; n < num_output_pixels; ++n) {
              int loc = n * output_depth + c;
              int32x4_t acc = vld1q_s32(acc_buffer + loc);
              float32x4_t float_acc = vcvtq_f32_s32(acc);
              float_acc = vmulq_f32(float_acc, channel_scale_32x4);
              float_acc = vmulq_f32(float_acc, input_scale_32x4);
              float_acc = vaddq_f32(float_acc, bias_32x4);
              float_acc = vmaxq_f32(float_acc, output_activation_min_vec);
              float_acc = vminq_f32(float_acc, output_activation_max_vec);
              vst1q_f32(output_ptr + loc, float_acc);
            }
          }
#endif  // USE_NEON

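          // Scalar path for any channels not covered above: dequantize each
          // int32 accumulator with the per-batch input scale and per-channel
          // filter scale, add the float bias, and clamp to the activation
          // range.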
          for (; c < target_output_depth; c++) {
            for (int n = 0; n < num_output_pixels; ++n) {
              int loc = n * output_depth + c;
              int32 acc = acc_buffer[loc];
              float float_acc = acc * input_scale * per_channel_scales[c];
              float_acc += bias_data[c];
              float_acc = std::max(float_acc, output_activation_min);
              float_acc = std::min(float_acc, output_activation_max);
              output_ptr[loc] = float_acc;
            }
          }
        }
        output_ptr += num_output_values;
      }
    }
    output_ptr += batch_step;
  }
}

}  // namespace depthwise_conv

template <DepthwiseConvOutputRounding kOutputRounding>
inline void DepthwiseConvHybridWithRounding(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) {
  ruy::profiler::ScopeLabel label("DepthwiseConvHybridInt8/8bit");
  const int depth_multiplier = params.depth_multiplier;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  TFLITE_DCHECK_GE(dilation_width_factor, 1);
  TFLITE_DCHECK_GE(dilation_height_factor, 1);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_depth = input_shape.Dims(3);
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

  // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
  // Jetson TX-2. This compiler does not support the offsetof() macro.

#if defined(__aarch64__) && !defined(GOOGLE_L4T)
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;

  // Call kernel optimized for depthwise convolutions using 3x3 filters if
  // parameters are supported.
  if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
          optimized_ops::depthwise_conv::QuantizationType::kNonPerChannelUint8>(
          input_shape, filter_shape, stride_width, stride_height,
          dilation_width_factor, dilation_height_factor, pad_width, pad_height,
          depth_multiplier, output_shape, 0, nullptr)) {
    ruy::profiler::ScopeLabel specialized_label(
        "DepthwiseConvHybridInt8/8bit/3x3");
    optimized_ops::depthwise_conv::DepthwiseConvHybrid3x3FilterPerChannel<
        DepthwiseConvOutputRounding::kUpward>(
        params, input_scales, input_shape, input_data,
        filter_shape, filter_data, bias_shape, bias_data, output_shape,
        output_data, per_channel_scales, input_offsets,
        thread_start, thread_end, thread_dim);
    return;
  }
#endif

  ruy::profiler::ScopeLabel specialized_label(
      "DepthwiseConvHybridInt8/8bit/General");
  depthwise_conv::DepthwiseConvHybridGeneral(
      params, input_scales, input_shape, input_data,
      filter_shape, filter_data, bias_shape, bias_data, output_shape,
      output_data, per_channel_scales, input_offsets,
      thread_start, thread_end, thread_dim);
}

inline void DepthwiseConvHybridImpl(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) {
  return DepthwiseConvHybridWithRounding<
      DepthwiseConvOutputRounding::kAwayFromZero>(
      params, input_scales, input_shape, input_data,
      filter_shape, filter_data, bias_shape, bias_data, output_shape,
      output_data, per_channel_scales, input_offsets,
      thread_start, thread_end, thread_dim);
}

template <typename T, typename TS>
struct DepthwiseConvHybridWorkerTask : cpu_backend_threadpool::Task {
  DepthwiseConvHybridWorkerTask(const DepthwiseParams& params,
                                const float* input_scales,
                                const RuntimeShape& input_shape,
                                const T* input_data,
                                const RuntimeShape& filter_shape,
                                const T* filter_data,
                                const RuntimeShape& bias_shape,
                                const TS* bias_data,
                                const RuntimeShape& output_shape,
                                float* output_data,
                                const float* per_channel_scales,
                                const int32_t* input_offsets,
                                int thread_start, int thread_end,
                                int thread_dim)
      : params(params),
        input_scales(input_scales),
        input_shape(input_shape),
        input_data(input_data),
        filter_shape(filter_shape),
        filter_data(filter_data),
        bias_shape(bias_shape),
        bias_data(bias_data),
        output_shape(output_shape),
        output_data(output_data),
        per_channel_scales(per_channel_scales),
        input_offsets(input_offsets),
        thread_start(thread_start),
        thread_end(thread_end),
        thread_dim(thread_dim) {}

  void Run() override {
    DepthwiseConvHybridImpl(params, input_scales, input_shape,
                            input_data, filter_shape, filter_data,
                            bias_shape, bias_data, output_shape,
                            output_data, per_channel_scales, input_offsets,
                            thread_start, thread_end, thread_dim);
  }

 private:
  const DepthwiseParams& params;
  const float* input_scales;
  const RuntimeShape& input_shape;
  const T* input_data;
  const RuntimeShape& filter_shape;
  const T* filter_data;
  const RuntimeShape& bias_shape;
  const TS* bias_data;
  const RuntimeShape& output_shape;
  float* output_data;
  const float* per_channel_scales;
  const int32_t* input_offsets;
  int thread_start;
  int thread_end;
  int thread_dim;
};

inline void DepthwiseConvHybridPerChannel(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, int32_t* input_offsets,
    CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("DepthwiseConvHybridInt8");
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

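  // Choose whether to parallelize across the batch dimension or across
  // output rows, whichever offers more parallelism for this shape.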
  const int output_batches = output_shape.Dims(0);
  const int output_rows = output_shape.Dims(1);
  int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
  int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
  int thread_dim, thread_count, thread_dim_size;
  if (thread_count_batch > thread_count_row) {
    thread_dim = 0;
    thread_dim_size = output_batches;
    thread_count = thread_count_batch;
  } else {
    thread_dim = 1;
    thread_dim_size = output_rows;
    thread_count = thread_count_row;
  }

  const int max_threads = cpu_backend_context->max_num_threads();
  thread_count = std::max(1, std::min(thread_count, max_threads));

  if (thread_count == 1) {
    DepthwiseConvHybridImpl(params, input_scales, input_shape,
                            input_data, filter_shape, filter_data, bias_shape,
                            bias_data, output_shape, output_data,
                            per_channel_scales, input_offsets,
                            /*thread_start=*/0, /*thread_end=*/output_rows,
                            /*thread_dim=*/1);
  } else {
    std::vector<DepthwiseConvHybridWorkerTask<int8, float>> tasks;
    // TODO(b/131746020) don't create new heap allocations every time.
    // At least we make it a single heap allocation by using reserve().
    tasks.reserve(thread_count);
    int thread_start = 0;
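    // Split thread_dim_size as evenly as possible: each task takes an equal
    // share of the range that remains.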
    for (int i = 0; i < thread_count; ++i) {
      int thread_end =
          thread_start + (thread_dim_size - thread_start) / (thread_count - i);
      tasks.emplace_back(params, input_scales, input_shape,
                         input_data, filter_shape, filter_data, bias_shape,
                         bias_data, output_shape, output_data,
                         per_channel_scales, input_offsets, thread_start,
                         thread_end, thread_dim);
      thread_start = thread_end;
    }
    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
                                    cpu_backend_context);
  }
}

}  // namespace optimized_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_