/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_

#include <algorithm>
#include <cstring>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_hybrid_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_integer_ops {
namespace depthwise_conv {

// Initializes the accumulator buffer with zeros.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       int32* acc_buffer) {
  memset(acc_buffer, 0,
         sizeof(acc_buffer[0]) * output_depth * num_output_pixels);
}

// General implementation of hybrid depthwise convolution: int8 inputs and
// filters are accumulated in int32, then dequantized to float outputs using
// the per-batch input scales and per-channel filter scales.
inline void DepthwiseConvHybridGeneral(
    const DepthwiseParams& params,
    const float* input_scales, const RuntimeShape& input_shape,
    const int8* input_data, const RuntimeShape& filter_shape,
    const int8* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data, const float* per_channel_scales,
    const int32_t* input_offsets, int thread_start, int thread_end,
    int thread_dim) {
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_rows = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);

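  // The output is produced in tiles along the x dimension: acc_buffer holds
  // int32 accumulators for up to kOutputPixelsInAccBuffer output pixels at a
  // time, each with output_depth channels.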
  static const int kAccBufferMaxSize = 2048;
  int32 acc_buffer[kAccBufferMaxSize];
  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
                   kAccBufferActualSize);
  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConvHybrid op.
  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

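// This macro picks the first specialized row-accumulation kernel whose
// compile-time constraints (allowed stride, fixed input depth, fixed depth
// multiplier) match the runtime parameters; once row_accum_func is set,
// later candidates are skipped.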
#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
                                        FIXED_DEPTH_MULTIPLIER)           \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
    row_accum_func =                                                      \
        QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
                                       FIXED_DEPTH_MULTIPLIER>;           \
  }

#ifdef USE_NEON
  // We go over our list of kernels by decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride!=1.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels that allow a variable input depth; these are the
  // least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
#endif  // USE_NEON

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func) {
    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
  }

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);

  // Now that we have determined row_accum_func, we can start work.
  int batch_start = 0;
  int batch_end = batches;
  int row_start = 0;
  int row_end = output_rows;
  int output_ptr_offset = 0;

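  // thread_dim == 0 shards the work across batches; thread_dim == 1 shards it
  // across output rows within every batch.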
  switch (thread_dim) {
    case 0:
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, batches);
      batch_start = thread_start;
      batch_end = thread_end;
      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
      break;
    case 1:
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, output_rows);
      row_start = thread_start;
      row_end = thread_end;
      output_ptr_offset = row_start * output_width * output_depth;
      break;
  }

  float* output_ptr = output_data + output_ptr_offset;
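  // After the rows assigned to this thread are written for one batch, advance
  // output_ptr past the rows this thread does not own so that it lands on
  // row_start of the next batch.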
  int batch_step =
      (output_rows + row_start - row_end) * output_width * output_depth;
  for (int b = batch_start; b < batch_end; ++b) {
    float input_scale = input_scales[b];
    int32_t input_offset = input_offsets[b];
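    // In hybrid mode the float input is quantized on the fly per batch, so
    // each batch carries its own quantization scale and zero point.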
    for (int out_y = row_start; out_y < row_end; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      const int filter_y_start =
          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
                          dilation_height_factor);
      const int filter_y_end =
          std::min(filter_height,
                   (input_height - in_y_origin + dilation_height_factor - 1) /
                       dilation_height_factor);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, acc_buffer);

        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + dilation_height_factor * filter_y;
          row_accum_func(
              stride_width, dilation_width_factor, input_depth, input_width,
              input_data + in_y * input_height_stride + b * input_batch_stride,
              -input_offset, pad_width, depth_multiplier, filter_width,
              filter_data + filter_y * filter_height_stride, out_x_buffer_start,
              out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating int32 values. Dequantize them to float: scale
        // by the per-batch input scale and per-channel filter scale, add the
        // bias, clamp to the activation range, and store.
        ruy::profiler::ScopeLabel label("store");
        const int num_output_values = output_depth * num_output_pixels;
        int c = 0;
        while (c < output_depth) {
          int target_output_depth = output_depth;

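          // Process four channels at a time with NEON; the scalar loop below
          // handles any remaining channels (and all channels when NEON is not
          // available).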
#ifdef USE_NEON
          const float32x4_t output_activation_min_vec =
              vdupq_n_f32(output_activation_min);
          const float32x4_t output_activation_max_vec =
              vdupq_n_f32(output_activation_max);
          const float32x4_t input_scale_32x4 = vdupq_n_f32(input_scale);
          for (; c <= output_depth - 4; c += 4) {
            if ((c + 4) > output_depth) {
              break;
            }
            const float32x4_t channel_scale_32x4 =
                vld1q_f32(per_channel_scales + c);
            const float32x4_t bias_32x4 = vld1q_f32(bias_data + c);
            for (int n = 0; n < num_output_pixels; ++n) {
              int loc = n * output_depth + c;
              int32x4_t acc = vld1q_s32(acc_buffer + loc);
              float32x4_t float_acc = vcvtq_f32_s32(acc);
              float_acc = vmulq_f32(float_acc, channel_scale_32x4);
              float_acc = vmulq_f32(float_acc, input_scale_32x4);
              float_acc = vaddq_f32(float_acc, bias_32x4);
              float_acc = vmaxq_f32(float_acc, output_activation_min_vec);
              float_acc = vminq_f32(float_acc, output_activation_max_vec);
              vst1q_f32(output_ptr + loc, float_acc);
            }
          }
#endif  // USE_NEON

          for (; c < target_output_depth; c++) {
            for (int n = 0; n < num_output_pixels; ++n) {
              int loc = n * output_depth + c;
              int32 acc = acc_buffer[loc];
              float float_acc = acc * input_scale * per_channel_scales[c];
              float_acc += bias_data[c];
              float_acc = std::max(float_acc, output_activation_min);
              float_acc = std::min(float_acc, output_activation_max);
              output_ptr[loc] = float_acc;
            }
          }
        }
        output_ptr += num_output_values;
      }
    }
    output_ptr += batch_step;
  }
}

}  // namespace depthwise_conv

template <DepthwiseConvOutputRounding kOutputRounding>
inline void DepthwiseConvHybridWithRounding(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) {
  ruy::profiler::ScopeLabel label("DepthwiseConvHybridInt8/8bit");
  const int depth_multiplier = params.depth_multiplier;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  TFLITE_DCHECK_GE(dilation_width_factor, 1);
  TFLITE_DCHECK_GE(dilation_height_factor, 1);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_depth = input_shape.Dims(3);
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
// Jetson TX-2. This compiler does not support the offsetof() macro.

#if defined(__aarch64__) && !defined(GOOGLE_L4T)
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;

  // Call the kernel optimized for depthwise convolutions with 3x3 filters if
  // the parameters are supported.
  if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
      optimized_ops::depthwise_conv::QuantizationType::kNonPerChannelUint8>(
          input_shape, filter_shape, stride_width, stride_height,
          dilation_width_factor, dilation_height_factor, pad_width, pad_height,
          depth_multiplier, output_shape, 0, nullptr)) {
    ruy::profiler::ScopeLabel specialized_label(
        "DepthwiseConvHybridInt8/8bit/3x3");
    optimized_ops::depthwise_conv::DepthwiseConvHybrid3x3FilterPerChannel<
        DepthwiseConvOutputRounding::kUpward>(
            params, input_scales, input_shape, input_data,
            filter_shape, filter_data, bias_shape, bias_data, output_shape,
            output_data, per_channel_scales, input_offsets,
            thread_start, thread_end, thread_dim);
    return;
  }
#endif

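  // Fall back to the general kernel when the specialized 3x3 path does not
  // apply (or is not compiled in).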
  ruy::profiler::ScopeLabel specialized_label(
      "DepthwiseConvHybridInt8/8bit/General");
  depthwise_conv::DepthwiseConvHybridGeneral(
      params, input_scales, input_shape, input_data,
      filter_shape, filter_data, bias_shape, bias_data, output_shape,
      output_data, per_channel_scales, input_offsets,
      thread_start, thread_end, thread_dim);
}

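// Runs a single shard of the work, dispatching with away-from-zero output
// rounding. This is the function invoked by each worker task below and by the
// single-threaded path.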
inline void DepthwiseConvHybridImpl(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) {
  return DepthwiseConvHybridWithRounding<
      DepthwiseConvOutputRounding::kAwayFromZero>(
          params, input_scales, input_shape, input_data,
          filter_shape, filter_data, bias_shape, bias_data, output_shape,
          output_data, per_channel_scales, input_offsets,
          thread_start, thread_end, thread_dim);
}

template <typename T, typename TS>
struct DepthwiseConvHybridWorkerTask : cpu_backend_threadpool::Task {
  DepthwiseConvHybridWorkerTask(const DepthwiseParams& params,
                                const float* input_scales,
                                const RuntimeShape& input_shape,
                                const T* input_data,
                                const RuntimeShape& filter_shape,
                                const T* filter_data,
                                const RuntimeShape& bias_shape,
                                const TS* bias_data,
                                const RuntimeShape& output_shape,
                                float* output_data,
                                const float* per_channel_scales,
                                const int32_t* input_offsets,
                                int thread_start, int thread_end,
                                int thread_dim)
      : params(params),
        input_scales(input_scales),
        input_shape(input_shape),
        input_data(input_data),
        filter_shape(filter_shape),
        filter_data(filter_data),
        bias_shape(bias_shape),
        bias_data(bias_data),
        output_shape(output_shape),
        output_data(output_data),
        per_channel_scales(per_channel_scales),
        input_offsets(input_offsets),
        thread_start(thread_start),
        thread_end(thread_end),
        thread_dim(thread_dim) {}

  void Run() override {
    DepthwiseConvHybridImpl(params, input_scales, input_shape,
                            input_data, filter_shape, filter_data,
                            bias_shape, bias_data, output_shape,
                            output_data, per_channel_scales, input_offsets,
                            thread_start, thread_end, thread_dim);
  }

 private:
  const DepthwiseParams& params;
  const float* input_scales;
  const RuntimeShape& input_shape;
  const T* input_data;
  const RuntimeShape& filter_shape;
  const T* filter_data;
  const RuntimeShape& bias_shape;
  const TS* bias_data;
  const RuntimeShape& output_shape;
  float* output_data;
  const float* per_channel_scales;
  const int32_t* input_offsets;
  int thread_start;
  int thread_end;
  int thread_dim;
};

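// Top-level entry point: decides how to shard the work (across batches or
// output rows), then runs the shards on the CPU backend thread pool, or
// inline when a single thread suffices.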
inline void DepthwiseConvHybridPerChannel(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, int32_t* input_offsets,
    CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("DepthwiseConvHybridInt8");
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  const int output_batches = output_shape.Dims(0);
  const int output_rows = output_shape.Dims(1);
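  // Shard along whichever dimension (batch or output row) offers more
  // parallelism.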
  int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
  int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
  int thread_dim, thread_count, thread_dim_size;
  if (thread_count_batch > thread_count_row) {
    thread_dim = 0;
    thread_dim_size = output_batches;
    thread_count = thread_count_batch;
  } else {
    thread_dim = 1;
    thread_dim_size = output_rows;
    thread_count = thread_count_row;
  }

  const int max_threads = cpu_backend_context->max_num_threads();
  thread_count = std::max(1, std::min(thread_count, max_threads));

  if (thread_count == 1) {
    DepthwiseConvHybridImpl(params, input_scales, input_shape,
                            input_data, filter_shape, filter_data, bias_shape,
                            bias_data, output_shape, output_data,
                            per_channel_scales, input_offsets,
                            /*thread_start=*/0, /*thread_end=*/output_rows,
                            /*thread_dim=*/1);
  } else {
    std::vector<DepthwiseConvHybridWorkerTask<int8, float>> tasks;
    // TODO(b/131746020) don't create new heap allocations every time.
    // At least we make it a single heap allocation by using reserve().
    tasks.reserve(thread_count);
    int thread_start = 0;
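    // Split [0, thread_dim_size) into thread_count contiguous, nearly equal
    // shards.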
    for (int i = 0; i < thread_count; ++i) {
      int thread_end =
          thread_start + (thread_dim_size - thread_start) / (thread_count - i);
      tasks.emplace_back(params, input_scales, input_shape,
                         input_data, filter_shape, filter_data, bias_shape,
                         bias_data, output_shape, output_data,
                         per_channel_scales, input_offsets, thread_start,
                         thread_end, thread_dim);
      thread_start = thread_end;
    }
    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
                                    cpu_backend_context);
  }
}

}  // namespace optimized_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_