1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
17 
18 // This file provides kernel implementations that are not used in shipped
19 // inference code, but rather (a) show how the C++ model code is designed and then
20 // transformed into asm code, and (b) aid with maintenance and later development
21 // of variations. Many projects (even including, say, the classic NAG libraries)
22 // develop highly optimized code, but do not maintain intermediate versions.
23 // Often the result is incomprehensible final-version code.
24 
25 #include <algorithm>
26 
27 #include "tensorflow/lite/kernels/internal/compatibility.h"
28 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
29 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
30 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
31 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
32 #include "tensorflow/lite/kernels/internal/types.h"
33 
34 namespace tflite {
35 namespace optimized_ops {
36 namespace depthwise_conv {
37 
38 #ifdef USE_NEON
39 
40 inline void util_vst1_u8(uint8* data_addr, uint8x8_t reg) {
41   return vst1_u8(data_addr, reg);
42 }
43 inline void util_vst1_x8(uint8* data_addr, int8x8_t reg) {
44   return vst1_u8(data_addr, vreinterpret_u8_s8(reg));
45 }
46 inline void util_vst1_x8(int8* data_addr, int8x8_t reg) {
47   return vst1_s8(data_addr, reg);
48 }
49 
50 // Lane operations are for clarity and convenience. We want to load and store
51 // 4 8-bit lanes together. So these are treated much like 32-bit loads and
52 // 32-bit stores. Stores require 32-bit alignment.
53 
54 #define vst1_lane_s8x4(dst, reg, lane_num)                                  \
55   TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0);           \
56   vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), vreinterpret_u32_s8(reg), \
57                 lane_num)
58 #define vst1_lane_u8x4(dst, reg, lane_num)                                  \
59   TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0);           \
60   vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), vreinterpret_u32_u8(reg), \
61                 lane_num)
62 #define vst1q_lane_s8x4(dst, reg, lane_num)                                   \
63   TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0);             \
64   vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), vreinterpretq_u32_s8(reg), \
65                  lane_num)
66 #define vst1q_lane_u8x4(dst, reg, lane_num)                                   \
67   TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0);             \
68   vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), vreinterpretq_u32_u8(reg), \
69                  lane_num)
70 
71 // Important! Most compilation configurations will compile and run without
72 // reinterpret_cast. Sanitizers may fail silently on lane-loading, with an
73 // obscure bug or mis-feature probably in unhygienic macro expansion.
74 #define vld1q_lane_s8x8(src, reg, lane_num)                                   \
75   vreinterpretq_s8_u64(vld1q_lane_u64(reinterpret_cast<const uint64_t*>(src), \
76                                       vreinterpretq_u64_s8(reg), lane_num))
77 #define vld1_lane_8x4(src, reg, lane_num)                                \
78   vreinterpret_s8_s32(vld1_lane_s32(reinterpret_cast<const int32*>(src), \
79                                     vreinterpret_s32_s8(reg), lane_num))
80 #define vld1q_lane_8x4(src, reg, lane_num)                                 \
81   vreinterpretq_s8_s32(vld1q_lane_s32(reinterpret_cast<const int32*>(src), \
82                                       vreinterpretq_s32_s8(reg), lane_num))
83 #define vld1q_dup_s8x4(src) \
84   vreinterpretq_s8_s32(vld1q_dup_s32(reinterpret_cast<const int32*>(src)))
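
// Illustrative only: a minimal sketch (not referenced elsewhere in this file)
// of how the 4-lane helpers above are intended to be used. The function name
// is hypothetical; both pointers are assumed to be 4-byte aligned, as the
// store macros DCHECK.
inline void util_example_lane_copy_s8x4(const int8* src, int8* dst) {
  // Load 4 bytes into lane 0 of an 8-byte register, then write that lane back
  // out as a single 32-bit store.
  int8x8_t reg = vdup_n_s8(0);
  reg = vld1_lane_8x4(src, reg, 0);
  vst1_lane_s8x4(dst, reg, 0);
}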
85 
86 #endif  // USE_NEON
87 
88 template <QuantizationType quantization_type>
89 struct ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
90                        quantization_type> {
91   // Filter data is provided as filter_block[3][3][depth/8][2][4]: height 3,
92   // width 3, depth/8 micro blocks, sub-block 0 or 1, depth 4. Filter data is
93   // written as filter_bank[3][2][4][4]; height 3, sub-block, depth 4, width 4.
94   //
95   // Note that this rearrangement is much like that performed on input data when
96   // filling the workspace, and optimized versions will be similar.
97   static inline void FillFilterBank(int depth, const uint8* filter_block,
98                                     int8 filter_bank[3][2][4][4]) {
99     constexpr int kSymmetricZeroPoint =
100         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
101     // Load the filter data in 8 bytes (one depth-8 micro block) at a time.
102     //
103     // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
104     // depth 4.
105     uint8 loaded_filter[3][4][2][4];
106     for (int y = 0; y < 3; ++y) {
107       for (int x = 0; x < 3; ++x) {
108         memcpy(loaded_filter[y][x][0], &filter_block[3 * y * depth + x * depth],
109                8);
110       }
111       // Pad the filter with the symmetric representation of 0, so that the
112       // values become 0 when the zero point is subtracted below. Thus these
113       // filter taps are effectively disregarded in later filtering.
114       memset(loaded_filter[y][3][0], kSymmetricZeroPoint, 8);
115     }
116     for (int y = 0; y < 3; ++y) {
117       for (int z = 0; z < 4; ++z) {
118         for (int x = 0; x < 4; ++x) {
119           filter_bank[y][0][z][x] =
120               loaded_filter[y][x][0][z] - kSymmetricZeroPoint;
121           filter_bank[y][1][z][x] =
122               loaded_filter[y][x][1][z] - kSymmetricZeroPoint;
123         }
124       }
125     }
126   }
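
  // Illustrative index mapping for the rearrangement above (first micro block,
  // j_depth == 0):
  //   filter_bank[y][s][z][x] ==
  //       filter_block[3 * y * depth + x * depth + 4 * s + z] - 128
  // for x < 3, and 0 for the padded column x == 3.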
127 
128   // Adjust the bias (weights) data according to the input offset.
129   //
130   // The output calculation is
131   // out[h][w][d] = bias[d] + sum_ij (in[h+i][w+j][d] + in_offset) *
132   //                                 (filter[i][j][d] + filter_offset)
133   // (where offsets are expressed as differences from 128).
134   //
135   // Since we cannot efficiently handle varying offsets / bias across the image,
136   // we insist on filter_offset = 0.
137   //
138   // This function calculates
139   // adjusted_bias[d] = bias[d] + sum_ij in_offset * filter[i][j][d]
140   // which accounts for the input offset. Even if the bias is constant over
141   // the depth, the adjusted bias will generally vary with depth.
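  //
  // Worked example (illustrative numbers only): with input_offset == -128 the
  // adjustment term (input_offset + 128) * sum_ij filter[i][j][d] is 0, so
  // adjusted_bias[d] == bias[d]; with input_offset == -100 and the (symmetric)
  // filter taps for depth d summing to S, adjusted_bias[d] == bias[d] + 28 * S.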
142   static inline void AdjustBias(int32 input_offset,
143                                 const int8 filter_bank[3][2][4][4],
144                                 const int32* bias_data,
145                                 int32 adjusted_bias_block[2][4]) {
146     constexpr int kSymmetricZeroPoint =
147         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
148     TFLITE_DCHECK_GE(input_offset, -255);
149     TFLITE_DCHECK_LE(input_offset, 0);
150     // For instance, if input_offset == -128, no adjustment is needed.
151     const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
152 
153     for (int s = 0; s < 2; ++s) {
154       for (int z = 0; z < 4; ++z) {
155         adjusted_bias_block[s][z] = bias_data[4 * s + z];
156         for (int i = 0; i < 9; ++i) {
157           adjusted_bias_block[s][z] +=
158               input_offset_difference * filter_bank[i % 3][s][z][i / 3];
159         }
160       }
161     }
162   }
163 
164   static void Run(const uint8* filter_data, const int32* bias_data,
165                   int8* shuffled_filter_data, int32* adjusted_bias_data,
166                   const DepthwiseConvDotProdParams* function_params) {
167     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
168     const int depth = function_params->output_depth;
169     const int depth_micro_repeats = function_params->depth_micro_repeats;
170     const int bias_increment = function_params->bias_increment;
171     const int32 input_offset = function_params->input_offset;
172 
173     int8 filter_bank[3][2][4][4];
174     int32 adjusted_bias_block[2][4];
175 
176     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
177       FillFilterBank(depth, filter_data + 8 * j_depth, filter_bank);
178       AdjustBias(input_offset, filter_bank,
179                  bias_data + 2 * bias_increment * j_depth, adjusted_bias_block);
180 
181       memcpy(shuffled_filter_data, filter_bank[0][0][0],
182              shuffled_filter_increment);
183       shuffled_filter_data += shuffled_filter_increment;
184       memcpy(adjusted_bias_data, adjusted_bias_block[0],
185              8 * sizeof(adjusted_bias_block[0][0]));
186       adjusted_bias_data += 8;
187     }
188   }
189 };
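
// Illustrative usage sketch only (not part of the library): pack filters and
// biases for a depth-16, 3x3 filter with the C-model implementation above. The
// function name and the parameter values are assumptions for the example; per
// micro block of depth 8 the packed layout is 2*3*4*4 = 96 int8 of shuffled
// filter data and 8 int32 of adjusted bias.
template <QuantizationType quantization_type>
inline void ExampleProcessPerDepthUsage(const uint8* filter_data,
                                        const int32* bias_data,
                                        int8* shuffled_filter_data,
                                        int32* adjusted_bias_data) {
  DepthwiseConvDotProdParams params = {};
  params.output_depth = 16;     // Two depth-8 micro blocks.
  params.depth_micro_repeats = 2;
  params.bias_increment = 4;    // Bias advances 4 values per sub-block here.
  params.input_offset = -128;   // Input zero point of 128.
  ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
                  quantization_type>::Run(filter_data, bias_data,
                                          shuffled_filter_data,
                                          adjusted_bias_data, &params);
}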
190 
191 template <QuantizationType quantization_type>
192 struct ProcessPerDepth<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
193                        quantization_type> {
194   static inline void Run(const uint8* filter_data, const int32* bias_data,
195                          int8* shuffled_filter_data, int32* adjusted_bias_data,
196                          const DepthwiseConvDotProdParams* function_params) {
197     const int depth = function_params->output_depth;
198     const int depth_micro_repeats = function_params->depth_micro_repeats;
199     const int bias_increment = function_params->bias_increment;
200 
201     // Simulate NEON-register transposition of subset of filter.
202     int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
203     int8 filter_bank_a_1[4][4];
204     int8 filter_bank_a_2[4][4];
205     int8 filter_bank_b_0[4][4];
206     int8 filter_bank_b_1[4][4];
207     int8 filter_bank_b_2[4][4];
208 
209     // Load filter data in, essentially dropping the [depth/8] dimension, which
210     // is equivalent to loading just the depth needed for one micro-block.
211     //
212     // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
213     // depth 4.
214     uint8 loaded_filter_0[4][2][4];
215     uint8 loaded_filter_1[4][2][4];
216     uint8 loaded_filter_2[4][2][4];
217 
218     constexpr int kSymmetricZeroPoint =
219         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
220     const int32 input_offset = function_params->input_offset;
221     TFLITE_DCHECK_GE(input_offset, -255);
222     TFLITE_DCHECK_LE(input_offset, 0);
223     const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
224 
225     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
226       const uint8* filter_block = filter_data + 8 * j_depth;
227 
228       // Filter data is provided as filter_block[3][3][depth/8][2][4].
229       // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
230       // Destination: filter_bank_{a,b}_{0,1,2}[4][4]; sub-block, height, depth 4, width 4.
231       for (int x = 0; x < 3; ++x) {
232         memcpy(loaded_filter_0[x][0], &filter_block[3 * 0 * depth + x * depth],
233                8);
234         memcpy(loaded_filter_1[x][0], &filter_block[3 * 1 * depth + x * depth],
235                8);
236         memcpy(loaded_filter_2[x][0], &filter_block[3 * 2 * depth + x * depth],
237                8);
238       }
239       // Pad the filter with the symmetric representation of 0, so that the
240       // values become 0 when the zero point is subtracted below, and so these
241       // filter taps are effectively disregarded.
242       memset(loaded_filter_0[3][0], kSymmetricZeroPoint, 8);
243       memset(loaded_filter_1[3][0], kSymmetricZeroPoint, 8);
244       memset(loaded_filter_2[3][0], kSymmetricZeroPoint, 8);
245 
246       for (int z = 0; z < 4; ++z) {
247         for (int x = 0; x < 4; ++x) {
248           filter_bank_a_0[z][x] =
249               loaded_filter_0[x][0][z] - kSymmetricZeroPoint;
250           filter_bank_b_0[z][x] =
251               loaded_filter_0[x][1][z] - kSymmetricZeroPoint;
252           filter_bank_a_1[z][x] =
253               loaded_filter_1[x][0][z] - kSymmetricZeroPoint;
254           filter_bank_b_1[z][x] =
255               loaded_filter_1[x][1][z] - kSymmetricZeroPoint;
256           filter_bank_a_2[z][x] =
257               loaded_filter_2[x][0][z] - kSymmetricZeroPoint;
258           filter_bank_b_2[z][x] =
259               loaded_filter_2[x][1][z] - kSymmetricZeroPoint;
260         }
261       }
262 
263       memcpy(shuffled_filter_data, filter_bank_a_0, 16);
264       shuffled_filter_data += 16;
265       memcpy(shuffled_filter_data, filter_bank_b_0, 16);
266       shuffled_filter_data += 16;
267       memcpy(shuffled_filter_data, filter_bank_a_1, 16);
268       shuffled_filter_data += 16;
269       memcpy(shuffled_filter_data, filter_bank_b_1, 16);
270       shuffled_filter_data += 16;
271       memcpy(shuffled_filter_data, filter_bank_a_2, 16);
272       shuffled_filter_data += 16;
273       memcpy(shuffled_filter_data, filter_bank_b_2, 16);
274       shuffled_filter_data += 16;
275 
276       int32 adjusted_bias_data_0[4];
277       int32 adjusted_bias_data_1[4];
278       // For instance, if input_offset == -128, no adjustment is needed.
279       for (int z = 0; z < 4; ++z) {
280         adjusted_bias_data_0[z] = bias_data[z];
281         adjusted_bias_data_1[z] = bias_data[4 + z];
282         for (int x = 0; x < 4; ++x) {
283           adjusted_bias_data_0[z] +=
284               input_offset_difference * filter_bank_a_0[z][x];
285           adjusted_bias_data_0[z] +=
286               input_offset_difference * filter_bank_a_1[z][x];
287           adjusted_bias_data_0[z] +=
288               input_offset_difference * filter_bank_a_2[z][x];
289           adjusted_bias_data_1[z] +=
290               input_offset_difference * filter_bank_b_0[z][x];
291           adjusted_bias_data_1[z] +=
292               input_offset_difference * filter_bank_b_1[z][x];
293           adjusted_bias_data_1[z] +=
294               input_offset_difference * filter_bank_b_2[z][x];
295 
296           adjusted_bias_data[z] = adjusted_bias_data_0[z];
297           adjusted_bias_data[4 + z] = adjusted_bias_data_1[z];
298         }
299       }
300       bias_data += 2 * bias_increment;
301       adjusted_bias_data += 8;
302     }
303   }
304 };
305 
306 #ifdef USE_NEON
307 template <QuantizationType quantization_type>
308 struct ProcessPerDepth<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
309                        quantization_type> {
310   static void ProcessPerDepthIntrinsics(
311       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
312           filter_data,
313       const int32* bias_data, int8* shuffled_filter_data,
314       int32* adjusted_bias_data,
315       const DepthwiseConvDotProdParams* function_params) {
316     const int depth = function_params->output_depth;
317     const int depth_micro_repeats = function_params->depth_micro_repeats;
318     const int bias_increment = function_params->bias_increment;
319 
320     constexpr int kSymmetricZeroPoint =
321         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
322     constexpr uint8 kSignBit =
323         QuantizationTypeImpl<quantization_type>::kUint8SignBit;
324     const int32 input_offset = function_params->input_offset;
325     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
326       TFLITE_DCHECK_GE(input_offset, -255);
327       TFLITE_DCHECK_LE(input_offset, 0);
328     }
329     const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
330     const int8x16_t ones_vector = vdupq_n_s8(1);
331 
332     // Simulate NEON-register transposition of subset of filter.
333     int8x16_t input_0_a;
334     int8x16_t input_0_b;
335     int8x16_t input_0_c;
336     int8x16_t input_1_a;
337     int8x16_t input_1_b;
338     int8x16_t input_1_c;
339     int8x16_t input_2_a;
340     int8x16_t input_2_b;
341     int8x16_t input_2_c;
342 
343     int8x16_t filter_0_a;
344     int8x16_t filter_0_b;
345     int8x16_t filter_1_a;
346     int8x16_t filter_1_b;
347     int8x16_t filter_2_a;
348     int8x16_t filter_2_b;
349 
350     // For uint8, subtraction of the zero point (= 128) is effected by XOR of the sign bit.
351     const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
352 
353     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
354         filter_block = filter_data;
355     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
356       // Filter data is provided as filter_block[3][3][depth/8][2][4].
357       // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
358       // Destination layout matches filter_bank[3][2][4][4]: height 3, sub-block, depth 4, width 4.
359 
360       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
361           filter_block_ptr = filter_block;
362       input_0_a = vld1q_lane_s8x8(filter_block_ptr, input_0_a, 0);
363       filter_block_ptr += depth;
364       input_0_b = vld1q_lane_s8x8(filter_block_ptr, input_0_b, 0);
365       filter_block_ptr += depth;
366       input_0_c = vld1q_lane_s8x8(filter_block_ptr, input_0_c, 0);
367       filter_block_ptr += depth;
368       input_1_a = vld1q_lane_s8x8(filter_block_ptr, input_1_a, 0);
369       filter_block_ptr += depth;
370       input_1_b = vld1q_lane_s8x8(filter_block_ptr, input_1_b, 0);
371       filter_block_ptr += depth;
372       input_1_c = vld1q_lane_s8x8(filter_block_ptr, input_1_c, 0);
373       filter_block_ptr += depth;
374       input_2_a = vld1q_lane_s8x8(filter_block_ptr, input_2_a, 0);
375       filter_block_ptr += depth;
376       input_2_b = vld1q_lane_s8x8(filter_block_ptr, input_2_b, 0);
377       filter_block_ptr += depth;
378       input_2_c = vld1q_lane_s8x8(filter_block_ptr, input_2_c, 0);
379 
380       filter_0_a = vzip1q_s8(input_0_a, input_0_b);
381       filter_0_b = vzip1q_s8(input_0_c, sign_bit);
382       filter_1_a = vzip1q_s8(input_1_a, input_1_b);
383       filter_1_b = vzip1q_s8(input_1_c, sign_bit);
384       filter_2_a = vzip1q_s8(input_2_a, input_2_b);
385       filter_2_b = vzip1q_s8(input_2_c, sign_bit);
386       if (quantization_type == QuantizationType::kNonPerChannelUint8) {
387         filter_0_a = veorq_s8(filter_0_a, sign_bit);
388         filter_0_b = veorq_s8(filter_0_b, sign_bit);
389         filter_1_a = veorq_s8(filter_1_a, sign_bit);
390         filter_1_b = veorq_s8(filter_1_b, sign_bit);
391         filter_2_a = veorq_s8(filter_2_a, sign_bit);
392         filter_2_b = veorq_s8(filter_2_b, sign_bit);
393       }
394       vzipq_s8x2_in_place(&filter_0_a, &filter_0_b);
395       vzipq_s8x2_in_place(&filter_1_a, &filter_1_b);
396       vzipq_s8x2_in_place(&filter_2_a, &filter_2_b);
397 
398       vst1q_s8(shuffled_filter_data, filter_0_a);
399       shuffled_filter_data += 16;
400       vst1q_s8(shuffled_filter_data, filter_0_b);
401       shuffled_filter_data += 16;
402       vst1q_s8(shuffled_filter_data, filter_1_a);
403       shuffled_filter_data += 16;
404       vst1q_s8(shuffled_filter_data, filter_1_b);
405       shuffled_filter_data += 16;
406       vst1q_s8(shuffled_filter_data, filter_2_a);
407       shuffled_filter_data += 16;
408       vst1q_s8(shuffled_filter_data, filter_2_b);
409       shuffled_filter_data += 16;
410 
411       int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
412       bias_data += bias_increment;
413       int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
414       bias_data += bias_increment;
415       // For instance, if input_offset == -kIntSymmetricZeroPoint (-128), no
416       // adjustment is needed.
417 
418       int32x4_t filter_sum_a = vdupq_n_s32(0);
419       filter_sum_a = vdotq_s32(filter_sum_a, filter_0_a, ones_vector);
420       filter_sum_a = vdotq_s32(filter_sum_a, filter_1_a, ones_vector);
421       filter_sum_a = vdotq_s32(filter_sum_a, filter_2_a, ones_vector);
422       int32x4_t filter_sum_b = vdupq_n_s32(0);
423       filter_sum_b = vdotq_s32(filter_sum_b, filter_0_b, ones_vector);
424       filter_sum_b = vdotq_s32(filter_sum_b, filter_1_b, ones_vector);
425       filter_sum_b = vdotq_s32(filter_sum_b, filter_2_b, ones_vector);
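      // vdotq_s32 against the all-ones vector sums each group of four int8
      // values, so each int32 lane of filter_sum_a / filter_sum_b accumulates
      // the per-depth sum of the filter taps for its sub-block, i.e. the
      // sum_ij filter[i][j][d] term used in the bias adjustment.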
426 
427       adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a,
428                                          input_offset_difference);
429       adjusted_bias_data_b = vmlaq_n_s32(adjusted_bias_data_b, filter_sum_b,
430                                          input_offset_difference);
431 
432       vst1q_s32(adjusted_bias_data, adjusted_bias_data_a);
433       adjusted_bias_data += 4;
434       vst1q_s32(adjusted_bias_data, adjusted_bias_data_b);
435       adjusted_bias_data += 4;
436 
437       filter_block += 8;
438     }
439   }
440 
441   static inline void Run(const typename QuantizationTypeImpl<
442                              quantization_type>::ExternalType* filter_data,
443                          const int32* bias_data, int8* shuffled_filter_data,
444                          int32* adjusted_bias_data,
445                          const DepthwiseConvDotProdParams* function_params) {
446     ProcessPerDepthIntrinsics(filter_data, bias_data, shuffled_filter_data,
447                               adjusted_bias_data, function_params);
448   }
449 };
450 #endif
451 
452 template <QuantizationType quantization_type, int32 max_padding>
453 struct PackMacroBlock<
454     DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
455     DepthwiseConvDepthMultiplication::kNoMultiplication, max_padding> {
456   // A straight copy of a macro block of input data into a scratch buffer.
457   //
458   // Requirement: depth_micro_repeats > 0.
459   static inline void CopyMacroBlock(
460       int32 height_block_number, int32 width_block_number,
461       const DepthwiseConvDotProdParams& function_params,
462       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
463           input_block_data,
464       int8* scratch_block_data) {
465     TFLITE_DCHECK_LE(max_padding, 1);
466 
467     // Strides.
468     // The input depth and count of micro blocks provide the width strides.
469     const int input_height_stride = function_params.input_height_stride;
470     const int workspace_height_stride = function_params.workspace_height_stride;
471     const int input_depth = function_params.input_depth;
472     const int depth_micro_repeats = function_params.depth_micro_repeats;
473     TFLITE_DCHECK_GT(depth_micro_repeats, 0);
474 
475     // Remaining iteration and dimension parameters.
476     //
477     // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
478     // final micro block is incomplete.
479     const int width_overall_micro_repeats =
480         function_params.input_width_overall_micro_repeats;
481     int input_width_micro_repeats = function_params.input_width_micro_repeats;
482     const int residual_width = function_params.residual_width;
483     const int block_height = function_params.inbound_block_height;
484 
485     const int padding_left = function_params.padding_left;
486     const int padding_right = function_params.padding_right;
487     const int padding_top = function_params.padding_top;
488     const int padding_bottom = function_params.padding_bottom;
489 
490     const bool leading_width_padding =
491         padding_left > 0 && width_block_number == 0;
492     const bool trailing_width_padding =
493         padding_right > 0 &&
494         width_block_number == (function_params.width_macro_count - 1);
495     const bool leading_height_padding =
496         padding_top > 0 && height_block_number < 0;
497     const bool trailing_height_padding =
498         padding_bottom > 0 &&
499         height_block_number == (function_params.height_macro_count - 1);
500 
501     // Modify the trailing case to reflect the input width.
502     int input_residual_width =
503         input_width_micro_repeats < width_overall_micro_repeats ? residual_width
504                                                                 : 4;
505     if (trailing_width_padding) {
506       input_residual_width -= 1;
507       input_width_micro_repeats = width_overall_micro_repeats - 1;
508     }
509 
510     constexpr int kSymmetricZeroPoint =
511         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
512     const int32 input_offset_difference =
513         function_params.input_offset + kSymmetricZeroPoint;
514 
515     // We load data into a temporary buffer and then save, to match subsequent
516     // processing. This will make it easier to combine stages into one ASM
517     // routine.
518     int8 tmp_load[4][2][4];
519 
520     int copy_block_height = block_height;
521     if (leading_height_padding) {
522       memset(scratch_block_data, -input_offset_difference,
523              workspace_height_stride);
524       scratch_block_data += workspace_height_stride;
525       input_block_data += input_height_stride;
526       copy_block_height -= 1;
527     }
528     if (trailing_height_padding) {
529       copy_block_height -= 1;
530     }
531 
532     // The outer 3 loops go through all the micro blocks in a macro block.
533     for (int k_height = 0; k_height < copy_block_height; ++k_height) {
534       for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
535         // Figure out division of work (available input vs trailing padding).
536         int adjusted_residual_width =
537             j_width == input_width_micro_repeats ? input_residual_width : 4;
538 
539         int start_width = 0;
540         if (leading_width_padding && j_width == 0) {
541           start_width = 1;
542           memset(tmp_load[0][0], -input_offset_difference, 8);
543         }
544         if (adjusted_residual_width < 4) {
545           for (int x = adjusted_residual_width; x < 4; ++x) {
546             memset(tmp_load[x][0], -input_offset_difference, 8);
547           }
548         }
549 
550         for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
551           // The inner 3 loops go through the sub-block, depth and width within
552           // each micro block.
553 
554           // Load, and apply symmetric offset.
555           int8* scratch_data =
556               scratch_block_data + k_height * workspace_height_stride +
557               j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
558           const typename QuantizationTypeImpl<quantization_type>::ExternalType*
559               input_data = input_block_data + k_height * input_height_stride +
560                            j_width * 4 * input_depth + i_depth * 8;
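          // Within one workspace row, micro blocks are grouped by depth: all
          // width positions for depth-micro-block 0 come first, then all width
          // positions for depth-micro-block 1, and so on, 32 bytes per micro
          // block.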
561           // Full-size micro blocks are 2*4*4 = 32 bytes.
562           for (int x = start_width; x < adjusted_residual_width; ++x) {
563             for (int s = 0; s < 2; ++s) {
564               for (int d = 0; d < 4; ++d) {
565                 tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
566                                     kSymmetricZeroPoint;
567               }
568             }
569           }
570 
571           // Save results.
572           memcpy(&scratch_data[0], tmp_load[0][0], 8);
573           memcpy(&scratch_data[8], tmp_load[1][0], 8);
574           memcpy(&scratch_data[16], tmp_load[2][0], 8);
575           memcpy(&scratch_data[24], tmp_load[3][0], 8);
576         }
577       }
578     }
579 
580     if (trailing_height_padding) {
581       memset(scratch_block_data + copy_block_height * workspace_height_stride,
582              -input_offset_difference, workspace_height_stride);
583     }
584   }
585 
586   // Transpose 4x4 blocks within each sub-micro-block.
587   //
588   // Implemented somewhat like NEON register manipulation, so that we can see
589   // equivalence of the two approaches.
590   static inline void MicroTransposeBlocks(
591       const DepthwiseConvDotProdParams& function_params,
592       int8* scratch_block_data) {
593     const int workspace_height_stride = function_params.workspace_height_stride;
594     const int width_overall_micro_repeats =
595         function_params.input_width_overall_micro_repeats;
596     const int depth_micro_repeats = function_params.depth_micro_repeats;
597     const int block_height = function_params.inbound_block_height;
598 
599     // Transpositions are 4x4, but doing 2 at a time is more efficient in the
600     // NEON code we are simulating.
601     int8 tmp_load[4][2][4];         // [width][sub-block][depth]
602     int8 tmp_transposed[4][2][4];   // [depth][sub-block][width]
603     int8 tmp_interleaved[2][4][4];  // [sub-block][depth][width]
604 
605     // The outer 3 loops go through all the micro blocks in a macro block.
606     for (int k_height = 0; k_height < block_height; ++k_height) {
607       for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
608         for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
609           int8* scratch_data =
610               scratch_block_data + k_height * workspace_height_stride +
611               j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
612           // A. Load data
613           memcpy(tmp_load[0][0], &scratch_data[0], 8);
614           memcpy(tmp_load[1][0], &scratch_data[8], 8);
615           memcpy(tmp_load[2][0], &scratch_data[16], 8);
616           memcpy(tmp_load[3][0], &scratch_data[24], 8);
617 
618           // B. Simulate between-register transposition.
619           for (int x = 0; x < 4; ++x) {
620             for (int y = 0; y < 4; ++y) {
621               tmp_transposed[x][0][y] = tmp_load[y][0][x];
622               tmp_transposed[x][1][y] = tmp_load[y][1][x];
623             }
624           }
625 
626           // C. Simulate between-register interleaving.
627           for (int x = 0; x < 4; ++x) {
628             for (int y = 0; y < 4; ++y) {
629               tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
630               tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
631             }
632           }
633           // D. Simulate mangled storage arrangement.
634           memcpy(&scratch_data[0], tmp_interleaved[0][0], 16);
635           memcpy(&scratch_data[16], tmp_interleaved[1][0], 16);
636         }
637       }
638     }
639   }
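
  // Worked example of the transposition above (illustrative): on entry, one
  // 32-byte micro block in the workspace is ordered [width][sub-block][depth],
  //   w0{s0 d0..d3, s1 d0..d3}, w1{...}, w2{...}, w3{...};
  // on exit it is ordered [sub-block][depth][width],
  //   s0{d0 w0..w3, d1 w0..w3, d2 w0..w3, d3 w0..w3}, then the same for s1.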
640 
641   static inline void Run(
642       int32 height_block_number, int32 width_block_number,
643       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
644           input_block_data,
645       int8* scratch_block_data,
646       const DepthwiseConvDotProdParams* function_params) {
647     CopyMacroBlock(height_block_number, width_block_number, *function_params,
648                    input_block_data, scratch_block_data);
649     MicroTransposeBlocks(*function_params, scratch_block_data);
650   }
651 };
652 
653 template <QuantizationType quantization_type, int32 max_padding>
654 struct PackMacroBlock<
655     DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
656     DepthwiseConvDepthMultiplication::kUnitInputDepth, max_padding> {
657   static inline void Run(
658       int32 height_block_number, int32 width_block_number,
659       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
660           input_block_data,
661       int8* scratch_block_data,
662       const DepthwiseConvDotProdParams* function_params) {
663     // Currently support for padding is limited to 1 on any side.
664     TFLITE_DCHECK_LE(max_padding, 1);
665 
666     // Strides.
667     // The count of micro blocks (below) provides the width strides.
668     const int input_height_stride = function_params->input_height_stride;
669     const int workspace_height_stride =
670         function_params->workspace_height_stride;
671 
672     // Remaining iteration and dimension parameters.
673     //
674     // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
675     // final micro block is incomplete.
676     const int width_overall_micro_repeats =
677         function_params->input_width_overall_micro_repeats;
678     const int input_width_micro_repeats =
679         function_params->input_width_micro_repeats;
680     const int residual_width = function_params->residual_width;
681     const int block_height = function_params->inbound_block_height;
682     TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
683 
684     const int padding_left = function_params->padding_left;
685     const int padding_right = function_params->padding_right;
686     const int padding_top = function_params->padding_top;
687     const int padding_bottom = function_params->padding_bottom;
688 
689     const bool leading_width_padding =
690         padding_left > 0 && width_block_number == 0;
691     const bool trailing_width_padding =
692         padding_right > 0 &&
693         width_block_number == (function_params->width_macro_count - 1);
694     const bool leading_height_padding =
695         padding_top > 0 && height_block_number < 0;
696     const bool trailing_height_padding =
697         padding_bottom > 0 &&
698         height_block_number == (function_params->height_macro_count - 1);
699 
700     constexpr int kSymmetricZeroPoint =
701         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
702     const int32 input_offset_difference =
703         function_params->input_offset + kSymmetricZeroPoint;
704 
705     int copy_block_height = block_height;
706     if (leading_height_padding) {
707       memset(scratch_block_data, -input_offset_difference,
708              workspace_height_stride + kWorkspaceExtension);
709       scratch_block_data += workspace_height_stride;
710       input_block_data += input_height_stride;
711       copy_block_height -= 1;
712     }
713     if (trailing_height_padding) {
714       copy_block_height -= 1;
715     }
716 
717     int adjusted_residual_width =
718         input_width_micro_repeats < width_overall_micro_repeats ? residual_width
719                                                                 : 4;
720 
721     if (trailing_width_padding) {
722       adjusted_residual_width -= 1;
723     }
724     int start_width = 0;
725     if (leading_width_padding) {
726       start_width = 1;
727       input_block_data += 1;
728     }
729 
730     const int copy_size = (width_overall_micro_repeats - 1) * 4 +
731                           adjusted_residual_width - start_width;
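    // For example (illustrative numbers): with 3 overall micro repeats, a full
    // final micro block (adjusted_residual_width == 4) and no leading padding,
    // copy_size == (3 - 1) * 4 + 4 - 0 == 12 input values per row.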
732 
733     TFLITE_DCHECK_LE(
734         copy_size,
735         input_height_stride - width_block_number * input_width_micro_repeats);
736     // We may drop up to stride-1 of trailing input.
737     TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
738 
739     // When there is unit input depth, the micro-block iteration need only be
740     // through the height. The micro blocks are contiguous across the width.
741     for (int k_height = 0; k_height < copy_block_height; ++k_height) {
742       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
743           input_data = input_block_data + k_height * input_height_stride;
744       int8* scratch_data =
745           scratch_block_data + k_height * workspace_height_stride;
746 
747       // Handle leading padding. This is overwritten if there is no padding.
748       scratch_data[0] = -input_offset_difference;
749 
750       memcpy(&scratch_data[start_width], input_data, copy_size);
751       for (int i = 0; i < copy_size; ++i) {
752         scratch_data[start_width + i] += -kSymmetricZeroPoint;
753       }
754 
755       // Handle trailing padding, and fill in remainder of micro block.
756       memset(&scratch_data[start_width + copy_size], -input_offset_difference,
757              4 - adjusted_residual_width + kWorkspaceExtension);
758     }
759 
760     if (trailing_height_padding) {
761       memset(scratch_block_data + copy_block_height * workspace_height_stride,
762              -input_offset_difference,
763              workspace_height_stride + kWorkspaceExtension);
764     }
765   }
766 };
767 
768 // Beginning of code section containing intermediate code transformation.
769 //
770 // This section is only compiled when kUseUnwound3x3DotProduct versions of
771 // templated functions are selected.
772 template <QuantizationType quantization_type>
773 struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
774                       quantization_type,
775                       DepthwiseConvDepthMultiplication::kNoMultiplication,
776                       /*max_padding=*/0> {
777   static inline void Run(
778       int32 height_block_number, int32 width_block_number,
779       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
780           input_block_data,
781       int8* scratch_block_data,
782       const DepthwiseConvDotProdParams* function_params) {
783     const int workspace_height_stride =
784         function_params->workspace_height_stride;
785     const int width_overall_micro_repeats =
786         function_params->input_width_overall_micro_repeats;
787     const int input_width_micro_repeats =
788         function_params->input_width_micro_repeats;
789     const int depth_micro_repeats = function_params->depth_micro_repeats;
790     const int block_height = function_params->inbound_block_height;
791     const int residual_width = function_params->residual_width;
792     const int input_height_stride = function_params->input_height_stride;
793     const int input_depth = function_params->input_depth;
794 
795     TFLITE_DCHECK_GE(depth_micro_repeats, 0);
796     constexpr int kSymmetricZeroPoint =
797         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
798     const int micro_block_size = 4 * 8;
799     const int depth_advance = width_overall_micro_repeats * micro_block_size;
800     const int width_advance =
801         micro_block_size *
802         (1 - depth_micro_repeats * width_overall_micro_repeats);
803     const int height_advance = workspace_height_stride -
804                                width_overall_micro_repeats * micro_block_size;
805     const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
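    // Illustrative reading of the advances above: within the inner depth loop
    // scratch_data moves forward by depth_advance per depth micro block;
    // width_advance then steps it back so that the net effect of one width
    // iteration is +32 bytes (the next micro-block column for depth 0), and
    // height_advance finally moves it to the start of the next workspace row.
    // input_depth_skip likewise realigns input_data to the next group of 4
    // width positions after the depth loop has advanced it by 8 per micro
    // block.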
806 
807     // Transpositions are 4x4, but doing 2 at a time is more efficient in the
808     // NEON code we are simulating. Note the blocks of 4x4 are still interleaved
809     // down the depth.
810     int8 tmp_load[4][2][4];
811     int8 tmp_transposed[4][2][4];
812     int8 tmp_interleaved[2][4][4];
813 
814     // Work through one slice, by row, at a time.
815     int8* scratch_data = scratch_block_data;
816     for (int k_height = 0; k_height < block_height; ++k_height) {
817       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
818           input_data = input_block_data;
819       input_block_data += input_height_stride;
820 
821       // Traverse the width one point at a time, but the depth in (micro) blocks
822       // of size 8.
823       //
824       // The depth and width margins, which are filled with "zeros", may be
825       // larger than is strictly needed to calculate output. This is because the
826       // conv calculation is performed across complete micro blocks.
827       for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
828         // Load, then zero.
829         for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
830           // A. Simulate register loading.
831           for (int x = 0; x < 4; ++x) {
832             for (int s = 0; s < 2; ++s) {
833               for (int d = 0; d < 4; ++d) {
834                 tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
835                                     kSymmetricZeroPoint;
836               }
837             }
838           }
839           // B. Simulate between-register transposition.
840           for (int x = 0; x < 4; ++x) {
841             for (int y = 0; y < 4; ++y) {
842               tmp_transposed[x][0][y] = tmp_load[y][0][x];
843               tmp_transposed[x][1][y] = tmp_load[y][1][x];
844             }
845           }
846 
847           // C and D are to be performed together as 4-byte stores in NEON code.
848           // C. Simulate between-register interleaving.
849           for (int x = 0; x < 4; ++x) {
850             for (int y = 0; y < 4; ++y) {
851               tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
852               tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
853             }
854           }
855           // D. Simulate mangled storage arrangement.
856           memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
857           memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
858           memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
859           memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);
860 
861           scratch_data += depth_advance;
862           input_data += 8;
863         }
864         scratch_data += width_advance;
865         input_data += input_depth_skip;
866       }
867       if (width_overall_micro_repeats > input_width_micro_repeats) {
868         TFLITE_DCHECK_EQ(width_overall_micro_repeats,
869                          input_width_micro_repeats + 1);
870         TFLITE_DCHECK_GT(residual_width, 0);
871         // Figure out division of work (available input vs zero-ed).
872         const int adjusted_residual_width = residual_width;
873         // Load, then zero.
874         for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
875           // A. Simulate register loading.
876           for (int x = 0; x < adjusted_residual_width; ++x) {
877             for (int s = 0; s < 2; ++s) {
878               for (int d = 0; d < 4; ++d) {
879                 tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
880                                     kSymmetricZeroPoint;
881               }
882             }
883           }
884           for (int x = adjusted_residual_width; x < 4; ++x) {
885             for (int s = 0; s < 2; ++s) {
886               for (int d = 0; d < 4; ++d) {
887                 tmp_load[x][s][d] = 0;
888               }
889             }
890           }
891           // B. Simulate between-register transposition.
892           for (int x = 0; x < 4; ++x) {
893             for (int y = 0; y < 4; ++y) {
894               tmp_transposed[x][0][y] = tmp_load[y][0][x];
895               tmp_transposed[x][1][y] = tmp_load[y][1][x];
896             }
897           }
898 
899           // C and D are to be performed together as 4-byte stores in NEON code.
900           // C. Simulate between-register interleaving.
901           for (int x = 0; x < 4; ++x) {
902             for (int y = 0; y < 4; ++y) {
903               tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
904               tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
905             }
906           }
907           // D. Simulate mangled storage arrangement.
908           memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
909           memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
910           memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
911           memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);
912 
913           scratch_data += depth_advance;
914           input_data += 8;
915         }
916         scratch_data += width_advance;
917         input_data += input_depth_skip;
918       }
919       scratch_data += height_advance;
920     }
921 
922     TFLITE_DCHECK_EQ(scratch_data, scratch_block_data +
923                                        block_height * workspace_height_stride);
924   }
925 };
926 
927 template <QuantizationType quantization_type>
928 struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
929                       quantization_type,
930                       DepthwiseConvDepthMultiplication::kNoMultiplication,
931                       /*max_padding=*/1> {
932   static inline void Run(
933       int32 height_block_number, int32 width_block_number,
934       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
935           input_block_data,
936       int8* scratch_block_data,
937       const DepthwiseConvDotProdParams* function_params) {
938     // Just use the C-model code for the padded case. Optimized versions merge
939     // the modifications therein to handle padding.
940     PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
941                    quantization_type,
942                    DepthwiseConvDepthMultiplication::kNoMultiplication,
943                    /*max_padding=*/1>::Run(height_block_number,
944                                            width_block_number, input_block_data,
945                                            scratch_block_data, function_params);
946   }
947 };
948 
949 template <QuantizationType quantization_type, int32 max_padding>
950 struct PackMacroBlock<
951     DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
952     DepthwiseConvDepthMultiplication::kUnitInputDepth, max_padding> {
953   static inline void Run(
954       int32 height_block_number, int32 width_block_number,
955       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
956           input_block_data,
957       int8* scratch_block_data,
958       const DepthwiseConvDotProdParams* function_params) {
959     const int workspace_height_stride =
960         function_params->workspace_height_stride;
961     const int width_overall_micro_repeats =
962         function_params->input_width_overall_micro_repeats;
963     const int input_width_micro_repeats =
964         function_params->input_width_micro_repeats;
965     const int block_height = function_params->inbound_block_height;
966     const int residual_width = function_params->residual_width;
967     const int input_height_stride = function_params->input_height_stride;
968 
969     const int padding_left = function_params->padding_left;
970     const int padding_right = function_params->padding_right;
971     const int padding_top = function_params->padding_top;
972     const int padding_bottom = function_params->padding_bottom;
973 
974     constexpr int kSymmetricZeroPoint =
975         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
976 
977     TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
978 
979     const bool leading_width_padding =
980         padding_left > 0 && width_block_number == 0;
981     const bool trailing_width_padding =
982         padding_right > 0 &&
983         width_block_number == (function_params->width_macro_count - 1);
984     const bool leading_height_padding =
985         padding_top > 0 && height_block_number < 0;
986     const bool trailing_height_padding =
987         padding_bottom > 0 &&
988         height_block_number == (function_params->height_macro_count - 1);
989 
990     const int32 input_offset = function_params->input_offset;
991     const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
992 
993     // Work through one slice, by row, at a time.
994     int8* scratch_data_base = scratch_block_data;
995 
996     int copy_block_height = block_height;
997     if (leading_height_padding) {
998       copy_block_height -= 1;
999       memset(scratch_data_base, -input_offset_difference,
1000              workspace_height_stride + kWorkspaceExtension);
1001       scratch_data_base += workspace_height_stride;
1002       input_block_data += input_height_stride;
1003     }
1004     if (trailing_height_padding) {
1005       copy_block_height -= 1;
1006     }
1007 
1008     int adjusted_residual_width =
1009         input_width_micro_repeats < width_overall_micro_repeats ? residual_width
1010                                                                 : 4;
1011 
1012     if (trailing_width_padding) {
1013       adjusted_residual_width -= 1;
1014     }
1015     int start_width = 0;
1016     if (leading_width_padding) {
1017       start_width = 1;
1018       input_block_data += 1;
1019     }
1020 
1021     const int copy_size = (width_overall_micro_repeats - 1) * 4 +
1022                           adjusted_residual_width - start_width;
1023     // Adjusted so that later conditionals are simplified.
1024     const int copy_size_adjusted =
1025         trailing_width_padding ? copy_size + 1 : copy_size;
1026 
1027     TFLITE_DCHECK_LE(
1028         copy_size,
1029         input_height_stride - width_block_number * input_width_micro_repeats);
1030     // We may drop up to stride-1 of trailing input.
1031     TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
1032 
1033     // This is used to simulate what should happen in registers.
1034     int8 tmp_data[16];
1035 
1036     int scratch_data_offset = 0;
1037     int input_block_offset = 0;
1038 
1039     if (copy_size >= 16) {
1040       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1041         // Work through one slice, by row, at a time.
1042         int8* scratch_data = scratch_data_base + scratch_data_offset;
1043 
1044         int copy_done = 0;
1045 
1046         // The surrounding condition ensures that we always need at least one
1047         // iteration of the main copy loop. In the case of leading width
1048         // padding, we unroll this specially.
1049         if (leading_width_padding) {
1050           memcpy(tmp_data + 1, input_block_data + input_block_offset, 15);
1051           for (int i = 0; i < 16; ++i) {
1052             tmp_data[i] += -kSymmetricZeroPoint;
1053           }
1054           tmp_data[0] = -input_offset_difference;
1055           memcpy(scratch_data, tmp_data, 16);
1056           copy_done += 15;
1057         }
1058 
1059         // Main copy loop.
1060         for (; (copy_done + 16) <= copy_size; copy_done += 16) {
1061           memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
1062                  16);
1063           for (int i = 0; i < 16; ++i) {
1064             tmp_data[i] += -kSymmetricZeroPoint;
1065           }
1066           TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
1067           memcpy(&scratch_data[start_width + copy_done], tmp_data, 16);
1068         }
1069 
1070         const int copy_remaining = copy_size - copy_done;
1071         // Total amount
1072         // = copy_size - copy_done + 4 - adjusted_residual_width
1073         // = width_overall_micro_repeats * 4 - start_width - copy_done.
1074         // Undone micro blocks
1075         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
1076 
1077         // Conditional is (copy_remaining > 0 || trailing_width_padding).
1078         if (copy_done < copy_size_adjusted) {
1079           // Employ overlapping-load strategy in order to load full register,
1080           // but use only part.
1081           memcpy(tmp_data,
1082                  input_block_data + input_block_offset + copy_done -
1083                      (16 - copy_remaining),
1084                  16);
1085           // Shift to select the part that we need.
1086           for (int i = 0; i < copy_remaining; ++i) {
1087             tmp_data[i] = tmp_data[(16 - copy_remaining) + i];
1088           }
1089           for (int i = 0; i < 16; ++i) {
1090             tmp_data[i] += -kSymmetricZeroPoint;
1091           }
1092           // Apply padding to remainder, some unnecessary but costless in regs.
1093           for (int i = copy_remaining; i < 16; ++i) {
1094             tmp_data[i] = -input_offset_difference;
1095           }
1096           const int final_repeats =
1097               width_overall_micro_repeats - (start_width + copy_done) / 4;
1098           for (int i = 0; i < final_repeats; ++i) {
1099             memcpy(&scratch_data[start_width + copy_done], tmp_data + 4 * i, 4);
1100             copy_done += 4;
1101           }
1102         }
1103         memset(scratch_data + start_width + copy_done, -input_offset_difference,
1104                kWorkspaceExtension);
1105 
1106         scratch_data_offset += workspace_height_stride;
1107         input_block_offset += input_height_stride;
1108       }
1109     } else if (copy_size >= 4) {
1110       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1111         // Work through one slice, by row, at a time.
1112         int8* scratch_data = scratch_data_base + scratch_data_offset;
1113 
1114         int copy_done = 0;
1115 
1116         // The surrounding condition ensures that we always need at least one
1117         // iteration of the main copy loop. In the case of leading width
1118         // padding, we unroll this specially.
1119         if (leading_width_padding) {
1120           memcpy(tmp_data + 1, input_block_data + input_block_offset, 3);
1121           for (int i = 0; i < 4; ++i) {
1122             tmp_data[i] += -kSymmetricZeroPoint;
1123           }
1124           tmp_data[0] = -input_offset_difference;
1125           memcpy(scratch_data, tmp_data, 4);
1126           copy_done += 3;
1127         }
1128 
1129         for (; (copy_done + 4) <= copy_size; copy_done += 4) {
1130           memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
1131                  4);
1132           for (int i = 0; i < 4; ++i) {
1133             tmp_data[i] += -kSymmetricZeroPoint;
1134           }
1135           // Perform this as a single 4-byte (int32) store, matching our alignment.
1136           memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
1137         }
1138 
1139         // Total amount
1140         // = copy_size - copy_done + 4 - adjusted_residual_width
1141         // = width_overall_micro_repeats * 4 - start_width - copy_done.
1142         // Undone micro blocks
1143         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
1144         const int copy_remaining = copy_size - copy_done;
1145         // Conditional is (copy_remaining > 0 || trailing_width_padding).
1146         if (copy_done < copy_size_adjusted) {
1147           TFLITE_DCHECK_LT(copy_remaining, 4);
1148           // Employ overlapping-load strategy in order to load full register,
1149           // but use only part.
1150           memcpy(tmp_data,
1151                  input_block_data + input_block_offset + copy_done -
1152                      (4 - copy_remaining),
1153                  4);
1154           // Shift to select the part that we need.
1155           for (int i = 0; i < copy_remaining; ++i) {
1156             tmp_data[i] = tmp_data[(4 - copy_remaining) + i];
1157           }
1158           for (int i = 0; i < 4; ++i) {
1159             tmp_data[i] += -kSymmetricZeroPoint;
1160           }
1161           // Apply padding to remainder, some unnecessary but costless in regs.
1162           for (int i = copy_remaining; i < 4; ++i) {
1163             tmp_data[i] = -input_offset_difference;
1164           }
1165           memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
1166           copy_done += 4;
1167         }
1168         memset(scratch_data + start_width + copy_done, -input_offset_difference,
1169                kWorkspaceExtension);
1170 
1171         scratch_data_offset += workspace_height_stride;
1172         input_block_offset += input_height_stride;
1173       }
1174     } else if (width_overall_micro_repeats == 2) {
1175       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1176         // Apply padding by quick fill of whole reg.
1177         for (int i = 0; i < 8; ++i) {
1178           tmp_data[i] = -input_offset;
1179         }
1180         for (int i = 0; i < copy_size; ++i) {
1181           // Apply shift-left insert, tmp_data as both operands.
1182           // The zero-index byte is left unchanged.
1183           for (int i = 7; i > 0; --i) {
1184             tmp_data[i] = tmp_data[i - 1];
1185           }
1186           tmp_data[1] =
1187               input_block_data[input_block_offset + (copy_size - 1 - i)];
1188         }
1189         if (!leading_width_padding) {
          // Remove the leading padding byte, junking the trailing byte; this
          // is OK because the maximum size is less than 8.
1192           TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
1193           for (int i = 0; i < 7; ++i) {
1194             tmp_data[i] = tmp_data[i + 1];
1195           }
1196         }
1197         for (int i = 0; i < 8; ++i) {
1198           tmp_data[i] += -kSymmetricZeroPoint;
1199         }
1200         memcpy(scratch_data_base + scratch_data_offset, tmp_data, 8);
1201         memset(scratch_data_base + scratch_data_offset + 8,
1202                -input_offset_difference, kWorkspaceExtension);
1203 
1204         scratch_data_offset += workspace_height_stride;
1205         input_block_offset += input_height_stride;
1206       }
1207     } else {
1208       TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
1209       // This path is basically the same as the preceding, 2-micro-block one,
1210       // but here we simply store fewer bytes.
1211       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1212         // Apply padding by quick fill of whole reg.
1213         for (int i = 0; i < 8; ++i) {
1214           tmp_data[i] = -input_offset;
1215         }
1216         for (int i = 0; i < copy_size; ++i) {
1217           // Apply shift-left insert, tmp_data as both operands.
1218           // The zero-index byte is left unchanged.
          for (int j = 7; j > 0; --j) {
            tmp_data[j] = tmp_data[j - 1];
          }
1222           tmp_data[1] =
1223               input_block_data[input_block_offset + (copy_size - 1 - i)];
1224         }
1225         if (!leading_width_padding) {
          // Remove the leading padding byte, junking the trailing byte; this
          // is OK because the maximum size is less than 8.
1228           TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
1229           for (int i = 0; i < 7; ++i) {
1230             tmp_data[i] = tmp_data[i + 1];
1231           }
1232         }
1233         for (int i = 0; i < 8; ++i) {
1234           tmp_data[i] += -kSymmetricZeroPoint;
1235         }
1236         memcpy(scratch_data_base + scratch_data_offset, tmp_data, 4);
1237         memset(scratch_data_base + scratch_data_offset + 4,
1238                -input_offset_difference, kWorkspaceExtension);
1239 
1240         scratch_data_offset += workspace_height_stride;
1241         input_block_offset += input_height_stride;
1242       }
1243     }
1244 
1245     scratch_data_base += copy_block_height * workspace_height_stride;
1246 
1247     if (trailing_height_padding) {
1248       memset(scratch_data_base, -input_offset_difference,
1249              workspace_height_stride + kWorkspaceExtension);
1250       scratch_data_base += workspace_height_stride;
1251     }
1252 
1253     TFLITE_DCHECK_EQ(
1254         scratch_data_base,
1255         scratch_block_data + block_height * workspace_height_stride);
1256   }
1257 };
1258 // The preceding section is only compiled when kUseUnwound3x3DotProduct versions
1259 // of templated functions are selected.
1260 //
1261 // End of code section containing intermediate code transformation.
1262 
1263 #ifdef USE_NEON
1264 template <QuantizationType quantization_type>
1265 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1266                       quantization_type,
1267                       DepthwiseConvDepthMultiplication::kNoMultiplication,
1268                       /*max_padding=*/0> {
1269   static inline void PackMacroBlockIntrinsics(
1270       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1271           input_block_data,
1272       int8* scratch_block_data,
1273       const DepthwiseConvDotProdParams* function_params) {
1274     TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
1275     TFLITE_DCHECK_EQ(function_params->padding_top, 0);
1276     TFLITE_DCHECK_EQ(function_params->padding_left, 0);
1277     TFLITE_DCHECK_EQ(function_params->padding_right, 0);
1278     const int workspace_height_stride =
1279         function_params->workspace_height_stride;
1280     const int width_overall_micro_repeats =
1281         function_params->input_width_overall_micro_repeats;
1282     const int input_width_micro_repeats =
1283         function_params->input_width_micro_repeats;
1284     const int depth_micro_repeats = function_params->depth_micro_repeats;
1285     const int block_height = function_params->inbound_block_height;
1286     const int residual_width = function_params->residual_width;
1287     const int input_height_stride = function_params->input_height_stride;
1288     const int input_depth = function_params->input_depth;
1289 
1290     TFLITE_DCHECK_GE(depth_micro_repeats, 0);
1291     constexpr uint8 kSignBit =
1292         QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1293     const int micro_block_size = 4 * 8;
1294     const int depth_advance = width_overall_micro_repeats * micro_block_size;
1295     const int width_advance =
1296         micro_block_size *
1297         (1 - depth_micro_repeats * width_overall_micro_repeats);
1298     const int height_advance = workspace_height_stride -
1299                                width_overall_micro_repeats * micro_block_size;
1300     const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
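    // Derived strides for walking the scratch workspace: within one row, the
    // 32-byte micro block for depth micro index d and width micro index w is
    // stored at offset (d * width_overall_micro_repeats + w) * 32. Thus
    // depth_advance steps over all width micro blocks, width_advance rewinds
    // the depth strides and moves one micro block to the right, and
    // height_advance completes the row out to workspace_height_stride.
    // input_depth_skip similarly moves the input pointer past the depth bytes
    // already consumed and on to the next group of four width positions.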
1301 
1302     // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1303     // code. Note the blocks of 4x4 are still interleaved down the depth.
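    // Sketch of the interleave performed below (assuming vzipq_s8x2_in_place
    // applies the second-level 16-bit zip): with registers a, b, c, d holding
    // depth values for four consecutive width positions, vzip1q/vzip2q plus
    // the in-place zip yield { a0, b0, c0, d0, a1, b1, c1, d1, ... }, i.e.
    // groups of four width values per depth channel - the 4x4 transposition
    // mentioned above.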
1304     int8x16_t work_reg_a;
1305     int8x16_t work_reg_b;
1306 
1307     // Effect subtraction of zero-point = 128 by XOR of sign bit.
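    // For example, in the uint8 path a raw input byte of 200 becomes
    // 200 ^ 0x80 = 72 = 200 - 128 when reinterpreted as int8.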
1308     const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
1309 
1310     // Work through one slice, by row, at a time.
1311     int8* scratch_data_0 = scratch_block_data;
1312 
1313     for (int k_height = 0; k_height < block_height; ++k_height) {
1314       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1315           input_data_0 = input_block_data;
1316       int8x16_t input_data_a;
1317       int8x16_t input_data_b;
1318       int8x16_t input_data_c;
1319       int8x16_t input_data_d;
1320 
1321       // Traverse the width one point at a time, but the depth in (micro) blocks
1322       // of size 8.
1323       //
1324       // The depth and width margins, which are filled with "zeros", may be
1325       // larger than is strictly needed to calculate output. This is because the
1326       // conv calculation is performed across complete micro blocks.
1327       for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
1328         int8x16_t work_reg_a_sp;
1329         int8x16_t work_reg_b_sp;
1330 
1331         int i_depth = 0;
1332 
1333         if (depth_micro_repeats >= 2) {
1334           i_depth += 2;
1335 
1336           input_data_a = util_vld1q_x8(input_data_0);
1337           input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1338           input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1339           input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1340           input_data_0 += 16;
1341 
1342           for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1343             work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1344             work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1345             vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1346             if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1347               work_reg_a = veorq_s8(work_reg_a, sign_bit);
1348               work_reg_b = veorq_s8(work_reg_b, sign_bit);
1349             }
1350 
1351             work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1352             work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1353             vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1354 
1355             input_data_a = util_vld1q_x8(input_data_0);
1356             input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1357             vst1q_s8(scratch_data_0, work_reg_a);
1358             vst1q_s8(scratch_data_0 + 16, work_reg_b);
1359 
1360             scratch_data_0 += depth_advance;
1361 
1362             if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1363               work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1364               work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1365             }
1366 
1367             input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1368             input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1369             vst1q_s8(scratch_data_0, work_reg_a_sp);
1370             vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1371 
1372             scratch_data_0 += depth_advance;
1373             input_data_0 += 16;
1374           }
1375 
1376           work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1377           work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1378           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1379           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1380             work_reg_a = veorq_s8(work_reg_a, sign_bit);
1381             work_reg_b = veorq_s8(work_reg_b, sign_bit);
1382           }
1383           vst1q_s8(scratch_data_0, work_reg_a);
1384           vst1q_s8(scratch_data_0 + 16, work_reg_b);
1385 
1386           scratch_data_0 += depth_advance;
1387 
1388           work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1389           work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1390           vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1391           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1392             work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1393             work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1394           }
1395 
1396           vst1q_s8(scratch_data_0, work_reg_a_sp);
1397           vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1398 
1399           scratch_data_0 += depth_advance;
1400         }
1401         for (; i_depth < depth_micro_repeats; ++i_depth) {
1402           input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1403           input_data_b =
1404               vld1q_lane_s8x8(input_data_0 + 1 * input_depth, input_data_b, 0);
1405           input_data_c =
1406               vld1q_lane_s8x8(input_data_0 + 2 * input_depth, input_data_c, 0);
1407           input_data_d =
1408               vld1q_lane_s8x8(input_data_0 + 3 * input_depth, input_data_d, 0);
1409           work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1410           work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1411 
1412           input_data_0 += 8;
1413 
1414           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1415           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1416             work_reg_a = veorq_s8(work_reg_a, sign_bit);
1417             work_reg_b = veorq_s8(work_reg_b, sign_bit);
1418           }
1419 
1420           vst1q_s8(scratch_data_0, work_reg_a);
1421           vst1q_s8(scratch_data_0 + 16, work_reg_b);
1422 
1423           scratch_data_0 += depth_advance;
1424         }
1425         scratch_data_0 += width_advance;
1426         input_data_0 += input_depth_skip;
1427       }
1428       if (width_overall_micro_repeats > input_width_micro_repeats) {
1429         TFLITE_DCHECK_EQ(width_overall_micro_repeats,
1430                          input_width_micro_repeats + 1);
1431         TFLITE_DCHECK_GT(residual_width, 0);
1432         TFLITE_DCHECK_LT(residual_width, 4);
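        // Width positions at or beyond residual_width are margin data:
        // input_data_c and input_data_d keep the kSignBit fill, which the XOR
        // below maps to 0 in the uint8 path (the per-channel int8 path
        // presumably defines kSignBit as 0, so no correction is needed).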
1433         for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1434           input_data_c = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
1435           input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1436           input_data_d = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
1437           if (residual_width > 1) {
1438             input_data_b =
1439                 vld1q_lane_s8x8(input_data_0 + input_depth, input_data_b, 0);
1440             if (residual_width == 3) {
1441               input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1442                                              input_data_c, 0);
1443             }
1444           }
1445           work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1446           work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1447 
1448           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1449             work_reg_a = veorq_s8(work_reg_a, sign_bit);
1450             work_reg_b = veorq_s8(work_reg_b, sign_bit);
1451           }
1452           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1453 
1454           vst1q_s8(scratch_data_0, work_reg_a);
1455           vst1q_s8(scratch_data_0 + 16, work_reg_b);
1456 
1457           scratch_data_0 += depth_advance;
1458           input_data_0 += 8;
1459         }
1460         scratch_data_0 += width_advance;
1461         input_data_0 += input_depth_skip;
1462       }
1463 
1464       scratch_data_0 += height_advance;
1465       input_block_data += input_height_stride;
1466     }
1467     TFLITE_DCHECK_EQ(
1468         scratch_data_0,
1469         scratch_block_data + block_height * workspace_height_stride);
1470   }
1471 
1472   static inline void Run(
1473       int32 height_block_number, int32 width_block_number,
1474       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1475           input_block_data,
1476       int8* scratch_block_data,
1477       const DepthwiseConvDotProdParams* function_params) {
1478 #ifdef __aarch64__
1479     PreloadInputBlock(input_block_data, function_params);
1480 #endif
1481     PackMacroBlockIntrinsics(input_block_data, scratch_block_data,
1482                              function_params);
1483   }
1484 };
1485 
1486 template <QuantizationType quantization_type>
1487 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1488                       quantization_type,
1489                       DepthwiseConvDepthMultiplication::kNoMultiplication,
1490                       /*max_padding=*/1> {
1491   static inline void PackMacroBlockIntrinsics(
1492       int32 height_block_number, int32 width_block_number,
1493       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1494           input_block_data,
1495       int8* scratch_block_data,
1496       const DepthwiseConvDotProdParams* function_params) {
1497     constexpr uint8 kSignBit =
1498         QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1499 
1500     const int workspace_height_stride =
1501         function_params->workspace_height_stride;
1502     const int width_overall_micro_repeats =
1503         function_params->input_width_overall_micro_repeats;
1504     const int input_width_micro_repeats =
1505         function_params->input_width_micro_repeats;
1506     const int depth_micro_repeats = function_params->depth_micro_repeats;
1507     const int block_height = function_params->inbound_block_height;
1508     const int residual_width = function_params->residual_width;
1509     const int input_height_stride = function_params->input_height_stride;
1510     const int input_depth = function_params->input_depth;
1511 
1512     const int padding_left = function_params->padding_left;
1513     const int padding_right = function_params->padding_right;
1514     const int padding_top = function_params->padding_top;
1515     const int padding_bottom = function_params->padding_bottom;
1516 
1517     TFLITE_DCHECK_GT(depth_micro_repeats, 0);
1518     constexpr int kSymmetricZeroPoint =
1519         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
1520 
1521     const int micro_block_size = 4 * 8;
1522     const int depth_advance = width_overall_micro_repeats * micro_block_size;
1523     const int width_advance =
1524         micro_block_size *
1525         (1 - depth_micro_repeats * width_overall_micro_repeats);
1526     const int height_advance = workspace_height_stride -
1527                                width_overall_micro_repeats * micro_block_size;
1528     const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
1529 
1530     const bool leading_width_padding =
1531         padding_left > 0 && width_block_number == 0;
1532     const bool trailing_width_padding =
1533         padding_right > 0 &&
1534         width_block_number == (function_params->width_macro_count - 1);
1535     const bool leading_height_padding =
1536         padding_top > 0 && height_block_number < 0;
1537     const bool trailing_height_padding =
1538         padding_bottom > 0 &&
1539         height_block_number == (function_params->height_macro_count - 1);
1540 
1541     const int32 input_offset = function_params->input_offset;
1542     const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
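    // Padding bytes are written as -(input_offset + kSymmetricZeroPoint).
    // Assuming the usual TFLite convention input_offset = -input_zero_point,
    // this is exactly what an input equal to the zero point becomes after the
    // symmetric-zero shift applied to real data below.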
1543 
1544     // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1545     // code. Note the blocks of 4x4 are still interleaved down the depth.
1546     int8x16_t work_reg_a;
1547     int8x16_t work_reg_b;
1548 
1549     // Effect subtraction of zero-point = 128 by XOR of sign bit.
1550     const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
1551 
1552     // Work through one slice, by row, at a time.
1553     int8* scratch_data_0 = scratch_block_data;
1554 
1555     int copy_block_height = block_height;
1556     if (leading_height_padding) {
1557       copy_block_height -= 1;
1558       memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
1559       scratch_data_0 += workspace_height_stride;
1560       input_block_data += input_height_stride;
1561     }
1562     if (trailing_height_padding) {
1563       copy_block_height -= 1;
1564     }
1565 
1566     for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1567       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1568           input_data_0 = input_block_data;
1569       int8x16_t input_data_a;
1570       int8x16_t input_data_b;
1571       int8x16_t input_data_c;
1572       int8x16_t input_data_d;
1573 
1574       // Traverse the width one point at a time, but the depth in (micro) blocks
1575       // of size 8.
1576       //
1577       // The depth and width margins, which are filled with "zeros", may be
1578       // larger than is strictly needed to calculate output. This is because the
1579       // conv calculation is performed across complete micro blocks.
1580       for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
        // Figure out the division of work (available input vs zeroed).
1582         int adjusted_residual_width =
1583             j_width == (input_width_micro_repeats) ? residual_width : 4;
1584 
1585         if (trailing_width_padding &&
1586             j_width == (width_overall_micro_repeats - 1)) {
1587           adjusted_residual_width -= 1;
1588         }
1589         int start_width = 0;
1590         if (leading_width_padding && j_width == 0) {
1591           start_width = 1;
1592         }
1593         if (start_width == 0) {
1594           if (adjusted_residual_width == 4) {
1595             int8x16_t work_reg_a_sp;
1596             int8x16_t work_reg_b_sp;
1597 
1598             int i_depth = 0;
1599 
1600             if (depth_micro_repeats >= 2) {
1601               i_depth += 2;
1602 
1603               input_data_a = util_vld1q_x8(input_data_0);
1604               input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1605               input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1606               input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1607               input_data_0 += 16;
1608 
1609               for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1610                 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1611                 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1612                 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1613                 if (quantization_type ==
1614                     QuantizationType::kNonPerChannelUint8) {
1615                   work_reg_a = veorq_s8(work_reg_a, sign_bit);
1616                   work_reg_b = veorq_s8(work_reg_b, sign_bit);
1617                 }
1618 
1619                 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1620                 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1621                 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1622 
1623                 input_data_a = util_vld1q_x8(input_data_0);
1624                 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1625                 vst1q_s8(scratch_data_0, work_reg_a);
1626                 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1627 
1628                 scratch_data_0 += depth_advance;
1629 
1630                 if (quantization_type ==
1631                     QuantizationType::kNonPerChannelUint8) {
1632                   work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1633                   work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1634                 }
1635 
1636                 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1637                 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1638                 vst1q_s8(scratch_data_0, work_reg_a_sp);
1639                 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1640 
1641                 scratch_data_0 += depth_advance;
1642                 input_data_0 += 16;
1643               }
1644 
1645               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1646               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1647               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1648               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1649                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1650                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1651               }
1652               vst1q_s8(scratch_data_0, work_reg_a);
1653               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1654 
1655               scratch_data_0 += depth_advance;
1656 
1657               work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1658               work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1659               vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1660               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1661                 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1662                 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1663               }
1664 
1665               vst1q_s8(scratch_data_0, work_reg_a_sp);
1666               vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1667 
1668               scratch_data_0 += depth_advance;
1669             }
1670             for (; i_depth < depth_micro_repeats; ++i_depth) {
1671               input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1672               input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
1673                                              input_data_b, 0);
1674               input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1675                                              input_data_c, 0);
1676               input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
1677                                              input_data_d, 0);
1678               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1679               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1680 
1681               input_data_0 += 8;
1682 
1683               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1684               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1685                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1686                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1687               }
1688 
1689               vst1q_s8(scratch_data_0, work_reg_a);
1690               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1691 
1692               scratch_data_0 += depth_advance;
1693             }
1694             scratch_data_0 += width_advance;
1695             input_data_0 += input_depth_skip;
1696           } else {
1697             TFLITE_DCHECK_LT(adjusted_residual_width, 4);
1698             for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1699               input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1700               input_data_b = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1701               input_data_c = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1702               input_data_d = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1703               if (adjusted_residual_width > 0) {
1704                 input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1705                 if (adjusted_residual_width > 1) {
1706                   input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
1707                                                  input_data_b, 0);
1708                   if (adjusted_residual_width == 3) {
1709                     input_data_c = vld1q_lane_s8x8(
1710                         input_data_0 + 2 * input_depth, input_data_c, 0);
1711                   }
1712                 }
1713               }
1714               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1715               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1716 
1717               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1718                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1719                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1720               }
1721               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1722 
1723               vst1q_s8(scratch_data_0, work_reg_a);
1724               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1725 
1726               scratch_data_0 += depth_advance;
1727               input_data_0 += 8;
1728             }
1729             scratch_data_0 += width_advance;
1730             input_data_0 += input_depth_skip;
1731           }
1732         } else {
1733           if (adjusted_residual_width == 4) {
1734             int8x16_t work_reg_a_sp;
1735             int8x16_t work_reg_b_sp;
1736 
1737             int i_depth = 0;
1738 
1739             if (depth_micro_repeats >= 2) {
1740               i_depth += 2;
1741 
1742               input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1743               input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1744               input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1745               input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1746               input_data_0 += 16;
1747 
1748               for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1749                 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1750                 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1751                 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1752                 if (quantization_type ==
1753                     QuantizationType::kNonPerChannelUint8) {
1754                   work_reg_a = veorq_s8(work_reg_a, sign_bit);
1755                   work_reg_b = veorq_s8(work_reg_b, sign_bit);
1756                 }
1757 
1758                 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1759                 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1760                 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1761 
1762                 input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1763                 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1764                 vst1q_s8(scratch_data_0, work_reg_a);
1765                 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1766 
1767                 scratch_data_0 += depth_advance;
1768 
1769                 if (quantization_type ==
1770                     QuantizationType::kNonPerChannelUint8) {
1771                   work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1772                   work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1773                 }
1774 
1775                 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1776                 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1777                 vst1q_s8(scratch_data_0, work_reg_a_sp);
1778                 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1779 
1780                 scratch_data_0 += depth_advance;
1781                 input_data_0 += 16;
1782               }
1783 
1784               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1785               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1786               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1787               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1788                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1789                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1790               }
1791               vst1q_s8(scratch_data_0, work_reg_a);
1792               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1793 
1794               scratch_data_0 += depth_advance;
1795 
1796               work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1797               work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1798               vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1799               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1800                 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1801                 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1802               }
1803 
1804               vst1q_s8(scratch_data_0, work_reg_a_sp);
1805               vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1806 
1807               scratch_data_0 += depth_advance;
1808             }
1809             for (; i_depth < depth_micro_repeats; ++i_depth) {
1810               input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1811               input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
1812                                              input_data_b, 0);
1813               input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1814                                              input_data_c, 0);
1815               input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
1816                                              input_data_d, 0);
1817               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1818               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1819 
1820               input_data_0 += 8;
1821 
1822               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1823               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1824                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1825                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1826               }
1827 
1828               vst1q_s8(scratch_data_0, work_reg_a);
1829               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1830 
1831               scratch_data_0 += depth_advance;
1832             }
1833             scratch_data_0 += width_advance;
1834             input_data_0 += input_depth_skip;
1835           } else {
1836             TFLITE_DCHECK_LT(adjusted_residual_width, 4);
1837 
1838             for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1839               input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1840               input_data_b = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1841               input_data_c = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1842               input_data_d = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1843               // Skip loading first column.
1844               if (adjusted_residual_width > 1) {
1845                 input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
1846                                                input_data_b, 0);
1847                 if (adjusted_residual_width == 3) {
1848                   input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1849                                                  input_data_c, 0);
1850                 }
1851               }
1852               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1853               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1854 
1855               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1856                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1857                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1858               }
1859               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1860 
1861               vst1q_s8(scratch_data_0, work_reg_a);
1862               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1863 
1864               scratch_data_0 += depth_advance;
1865               input_data_0 += 8;
1866             }
1867             scratch_data_0 += width_advance;
1868             input_data_0 += input_depth_skip;
1869           }
1870         }
1871       }
1872       scratch_data_0 += height_advance;
1873       input_block_data += input_height_stride;
1874     }
1875 
1876     if (trailing_height_padding) {
1877       memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
1878       scratch_data_0 += workspace_height_stride;
1879     }
1880 
1881     TFLITE_DCHECK_EQ(
1882         scratch_data_0,
1883         scratch_block_data + block_height * workspace_height_stride);
1884   }
1885 
1886   static inline void Run(
1887       int32 height_block_number, int32 width_block_number,
1888       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1889           input_block_data,
1890       int8* scratch_block_data,
1891       const DepthwiseConvDotProdParams* function_params) {
1892 #ifdef __aarch64__
1893     PreloadInputBlock(input_block_data, function_params);
1894 #endif
1895 
1896     PackMacroBlockIntrinsics(height_block_number, width_block_number,
1897                              input_block_data, scratch_block_data,
1898                              function_params);
1899   }
1900 };
1901 
1902 template <QuantizationType quantization_type>
1903 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1904                       quantization_type,
1905                       DepthwiseConvDepthMultiplication::kUnitInputDepth,
1906                       /*max_padding=*/1> {
1907   static inline void PackMacroBlockIntrinsics(
1908       int32 height_block_number, int32 width_block_number,
1909       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1910           input_block_data,
1911       int8* scratch_block_data,
1912       const DepthwiseConvDotProdParams* function_params) {
1913     const int workspace_height_stride =
1914         function_params->workspace_height_stride;
1915     const int width_overall_micro_repeats =
1916         function_params->input_width_overall_micro_repeats;
1917     const int input_width_micro_repeats =
1918         function_params->input_width_micro_repeats;
1919     const int block_height = function_params->inbound_block_height;
1920     const int residual_width = function_params->residual_width;
1921     const int input_height_stride = function_params->input_height_stride;
1922 
1923     const int padding_left = function_params->padding_left;
1924     const int padding_right = function_params->padding_right;
1925     const int padding_top = function_params->padding_top;
1926     const int padding_bottom = function_params->padding_bottom;
1927 
1928     constexpr int kSymmetricZeroPoint =
1929         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
1930 
1931     TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
1932 
1933     const bool leading_width_padding =
1934         padding_left > 0 && width_block_number == 0;
1935     const bool trailing_width_padding =
1936         padding_right > 0 &&
1937         width_block_number == (function_params->width_macro_count - 1);
1938     const bool leading_height_padding =
1939         padding_top > 0 && height_block_number < 0;
1940     const bool trailing_height_padding =
1941         padding_bottom > 0 &&
1942         height_block_number == (function_params->height_macro_count - 1);
1943 
1944     const int32 input_offset = function_params->input_offset;
1945     const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
1946 
1947     // Work through one slice, by row, at a time.
1948     int8* scratch_data_base = scratch_block_data;
1949 
1950     int copy_block_height = block_height;
1951     if (leading_height_padding) {
1952       copy_block_height -= 1;
1953       memset(scratch_data_base, -input_offset_difference,
1954              workspace_height_stride + kWorkspaceExtension);
1955       scratch_data_base += workspace_height_stride;
1956       input_block_data += input_height_stride;
1957     }
1958     if (trailing_height_padding) {
1959       copy_block_height -= 1;
1960     }
1961 
1962     int adjusted_residual_width =
1963         input_width_micro_repeats < width_overall_micro_repeats ? residual_width
1964                                                                 : 4;
1965 
1966     if (trailing_width_padding) {
1967       adjusted_residual_width -= 1;
1968     }
1969     int start_width = 0;
1970     if (leading_width_padding) {
1971       start_width = 1;
1972       input_block_data += 1;
1973     }
1974 
1975     const int copy_size = (width_overall_micro_repeats - 1) * 4 +
1976                           adjusted_residual_width - start_width;
1977     // Adjusted so that later conditionals are simplified.
1978     const int copy_size_adjusted =
1979         trailing_width_padding ? copy_size + 1 : copy_size;
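    // Example with hypothetical sizes: width_overall_micro_repeats = 3,
    // adjusted_residual_width = 2 and start_width = 1 give
    // copy_size = 2 * 4 + 2 - 1 = 9 input bytes per row; copy_size_adjusted
    // adds one slot when a trailing padding column must also be written.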
1980 
1981     TFLITE_DCHECK_LE(
1982         copy_size,
1983         input_height_stride - width_block_number * input_width_micro_repeats);
1984     // We may drop up to stride-1 of trailing input.
1985     TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
1986 
1987     int scratch_data_offset = 0;
1988     int input_block_offset = 0;
1989 
1990     constexpr uint8 kSignBit =
1991         QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1992 
1993     // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1994     // code. Note the blocks of 4x4 are still interleaved down the depth.
1995     int8x16_t work_reg;
1996     int8x8_t half_work_reg;
1997     int8x8_t padding_mask;
1998 
1999     // Effect subtraction of zero-point = 128 by XOR of sign bit.
2000     const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
2001     const int8x16_t padding_reg =
2002         vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
2003     padding_mask = vdup_n_s8(-1);
2004     half_work_reg = vdup_n_s8(0);
2005 
2006     if (copy_size >= 16) {
2007       const int copy_remaining = (copy_size + start_width) & 0x7;
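      // padding_mask starts as all ones. Shifting its 64-bit lane left by
      // 8 * copy_remaining bits clears the low copy_remaining byte lanes,
      // leaving ones only in the lanes past the valid data; vbsl later uses
      // the mask to overwrite exactly those lanes with the padding value.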
2008       padding_mask = vreinterpret_s8_s64(vshl_s64(
2009           vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
2010 
2011       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2012         // Work through one slice, by row, at a time.
2013         int8* scratch_data = scratch_data_base + scratch_data_offset;
2014 
2015         int copy_done = 0;
2016 
2017         // The surrounding condition ensures that we always need at least one
2018         // iteration of the main copy loop. In the case of leading width
2019         // padding, we unroll this specially.
2020         if (leading_width_padding) {
2021           work_reg = util_vld1q_x8(input_block_data + input_block_offset);
2022           work_reg = vextq_s8(padding_reg, work_reg, 15);
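          // vextq_s8(padding_reg, work_reg, 15) is the last byte of
          // padding_reg followed by the first 15 bytes of work_reg, i.e. one
          // padding byte is prepended; hence only 15 input bytes are consumed
          // here.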
2023           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2024             work_reg = veorq_s8(work_reg, sign_bit);
2025           }
2026           vst1q_s8(scratch_data, work_reg);
2027           copy_done += 15;
2028         }
2029 
2030         // Main copy loop.
2031         for (; (copy_done + 16) <= copy_size; copy_done += 16) {
2032           work_reg =
2033               util_vld1q_x8(input_block_data + input_block_offset + copy_done);
2034           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2035             work_reg = veorq_s8(work_reg, sign_bit);
2036           }
2037           TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
2038           vst1q_s8(scratch_data + start_width + copy_done, work_reg);
2039         }
2040 
2041         if (copy_done + 8 <= copy_size) {
2042           half_work_reg =
2043               util_vld1_x8(input_block_data + input_block_offset + copy_done);
2044           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2045             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2046           }
2047           TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
2048           vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2049           copy_done += 8;
2050         }
2051 
2052         TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2053         // Total amount
2054         // = copy_size - copy_done + 4 - adjusted_residual_width
2055         // = width_overall_micro_repeats * 4 - start_width - copy_done.
2056         // Undone micro blocks
2057         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2058 
2059         // Conditional is (copy_remaining > 0 || trailing_width_padding).
2060         if (copy_done < copy_size_adjusted) {
2061           // Employ overlapping-load strategy in order to load full register,
2062           // but use only part.
2063           // This has the advantage of resulting in zeros after shifting.
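          // The shift count below is negative, so vshl_s64 shifts right by
          // 8 * (8 - copy_remaining) bits, moving the last copy_remaining
          // valid bytes down to lanes 0..copy_remaining-1; padding_mask/vbsl
          // then replaces the upper lanes with the padding value.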
2064           half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
2065                                        copy_size - 8);
2066 
2067           half_work_reg = vreinterpret_s8_s64(
2068               vshl_s64(vreinterpret_s64_s8(half_work_reg),
2069                        vdup_n_s64(-8 * (8 - copy_remaining))));
2070           half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2071                                   vget_low_s8(padding_reg), half_work_reg);
2072 
2073           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2074             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2075           }
2076           TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
2077           vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2078         }
2079 
2080         // Trailing guard.
2081         vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2082         vst1_s8(scratch_data + start_width + copy_done + 8, half_work_reg);
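        // These stores land in the kWorkspaceExtension guard region past the
        // valid data (the scalar version memsets this region with the padding
        // value instead), so the exact values written here should never be
        // consumed as real data.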
2083 
2084         scratch_data_offset += workspace_height_stride;
2085         input_block_offset += input_height_stride;
2086       }
2087     } else if (copy_size >= 4) {
2088       const int copy_remaining = (copy_size + start_width) & 0x3;
2089       padding_mask = vreinterpret_s8_s64(vshl_s64(
2090           vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
2091 
2092       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2093         // Work through one slice, by row, at a time.
2094         int8* scratch_data = scratch_data_base + scratch_data_offset;
2095 
2096         int copy_done = 0;
2097 
2098         // The surrounding condition ensures that we always need at least one
2099         // iteration of the main copy loop. In the case of leading width
2100         // padding, we unroll this specially.
2101         if (leading_width_padding) {
2102           half_work_reg = vld1_lane_8x4(input_block_data + input_block_offset,
2103                                         half_work_reg, 0);
2104           half_work_reg = vext_s8(vget_low_s8(padding_reg), half_work_reg, 7);
2105           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2106             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2107           }
2108           vst1_lane_s8x4(scratch_data, half_work_reg, 0);
2109           copy_done += 3;
2110         }
2111 
2112         // Main copy loop.
2113         for (; (copy_done + 4) <= copy_size; copy_done += 4) {
2114           half_work_reg =
2115               vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
2116                             half_work_reg, 0);
2117           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2118             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2119           }
2120           TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
2121           vst1_lane_s8x4(scratch_data + start_width + copy_done, half_work_reg,
2122                          0);
2123         }
2124 
2125         TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2126         // Total amount
2127         // = copy_size - copy_done + 4 - adjusted_residual_width
2128         // = width_overall_micro_repeats * 4 - start_width - copy_done.
2129         // Undone micro blocks
2130         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2131 
2132         // Conditional is (copy_remaining > 0 || trailing_width_padding).
2133         if (copy_done < copy_size_adjusted) {
2134           TFLITE_DCHECK_LT(copy_remaining, 4);
2135           // Employ overlapping-load strategy in order to load full register,
2136           // but use only part.
2137           // This has the advantage of resulting in zeros after shifting.
2138           half_work_reg = vld1_lane_8x4(
2139               input_block_data + input_block_offset + copy_size - 4,
2140               half_work_reg, 0);
2141 
2142           half_work_reg = vreinterpret_s8_s64(
2143               vshl_s64(vreinterpret_s64_s8(half_work_reg),
2144                        vdup_n_s64(-8 * (4 - copy_remaining))));
2145           half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2146                                   vget_low_s8(padding_reg), half_work_reg);
2147 
2148           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2149             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2150           }
2151           TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
2152           vst1_lane_s8x4(scratch_data + start_width + copy_done, half_work_reg,
2153                          0);
2154           copy_done += 4;
2155         }
2156         // Trailing guard.
2157         vst1_lane_s8x4(scratch_data + start_width + copy_done, half_work_reg,
2158                        0);
2159         vst1_lane_s8x4(scratch_data + start_width + copy_done + 4,
2160                        half_work_reg, 0);
2161         vst1_lane_s8x4(scratch_data + start_width + copy_done + 8,
2162                        half_work_reg, 0);
2163         vst1_lane_s8x4(scratch_data + start_width + copy_done + 12,
2164                        half_work_reg, 0);
2165 
2166         scratch_data_offset += workspace_height_stride;
2167         input_block_offset += input_height_stride;
2168       }
2169     } else if (width_overall_micro_repeats == 2) {
2170       // Special case of 1 + 3 + 1, padding + copy + padding.
2171       // This is rarely executed in practice.
2172       TFLITE_DCHECK_EQ(copy_size, 3);
2173       TFLITE_DCHECK_EQ(start_width, 1);
2174       TFLITE_DCHECK(leading_width_padding);
2175       TFLITE_DCHECK(trailing_width_padding);
2176 
2177       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2178         half_work_reg = vreinterpret_s8_u8(vdup_n_u8(-input_offset));
2179         half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
2180                                          input_block_data + input_block_offset),
2181                                      half_work_reg, 1);
2182         half_work_reg =
2183             vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
2184                                                        input_block_offset + 1),
2185                          half_work_reg, 2);
2186         half_work_reg =
2187             vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
2188                                                        input_block_offset + 2),
2189                          half_work_reg, 3);
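        // Lanes 1..3 now hold the three input bytes; lane 0 and lanes 4..7
        // keep the padding fill, giving the 1 + 3 + 1 layout described above
        // (plus trailing padding within the 8-byte store).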
2190 
2191         if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2192           half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2193         }
2194         TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
2195         vst1_s8(scratch_data_base + scratch_data_offset, half_work_reg);
2196 
2197         // Trailing guard.
2198         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 4,
2199                        half_work_reg, 0);
2200         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 8,
2201                        half_work_reg, 0);
2202         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 12,
2203                        half_work_reg, 0);
2204         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 16,
2205                        half_work_reg, 0);
2206 
2207         scratch_data_offset += workspace_height_stride;
2208         input_block_offset += input_height_stride;
2209       }
2210     } else {
2211       TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
2212       const int copy_remaining = (copy_size + start_width) & 0x3;
2213       padding_mask = vreinterpret_s8_s64(vshl_s64(
2214           vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
2215       if (leading_width_padding) {
2216         padding_mask = vreinterpret_s8_u8(
2217             vset_lane_u8(255, vreinterpret_u8_s8(padding_mask), 0));
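        // Forcing byte lane 0 of the mask to all ones makes the vbsl below
        // select the padding value for the first byte, implementing the
        // leading padding column.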
2218       }
2219 
2220       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2221         for (int i = 0; i < copy_size; ++i) {
2222           half_work_reg = vreinterpret_s8_s64(
2223               vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2224           half_work_reg = vld1_lane_s8(
2225               reinterpret_cast<const int8*>(
2226                   input_block_data + input_block_offset + copy_size - 1 - i),
2227               half_work_reg, 0);
2228         }
2229         if (leading_width_padding) {
2230           half_work_reg = vreinterpret_s8_s64(
2231               vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2232         }
2233         half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2234                                 vget_low_s8(padding_reg), half_work_reg);
2235 
2236         if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2237           half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2238         }
2239         TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
2240         vst1_lane_s8x4(scratch_data_base + scratch_data_offset, half_work_reg,
2241                        0);
2242 
2243         // Trailing guard.
2244         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 4,
2245                        half_work_reg, 0);
2246         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 8,
2247                        half_work_reg, 0);
2248         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 12,
2249                        half_work_reg, 0);
2250         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 16,
2251                        half_work_reg, 0);
2252 
2253         scratch_data_offset += workspace_height_stride;
2254         input_block_offset += input_height_stride;
2255       }
2256     }
2257 
2258     scratch_data_base += copy_block_height * workspace_height_stride;
2259 
2260     if (trailing_height_padding) {
2261       memset(scratch_data_base, -input_offset_difference,
2262              workspace_height_stride + kWorkspaceExtension);
2263       scratch_data_base += workspace_height_stride;
2264     }
2265 
2266     TFLITE_DCHECK_EQ(
2267         scratch_data_base,
2268         scratch_block_data + block_height * workspace_height_stride);
2269   }
2270 
2271   static inline void Run(
2272       int32 height_block_number, int32 width_block_number,
2273       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2274           input_block_data,
2275       int8* scratch_block_data,
2276       const DepthwiseConvDotProdParams* function_params) {
2277 #ifdef __aarch64__
2278     PreloadInputBlock(input_block_data, function_params);
2279 #endif
2280 
2281     PackMacroBlockIntrinsics(height_block_number, width_block_number,
2282                              input_block_data, scratch_block_data,
2283                              function_params);
2284   }
2285 };
2286 
2287 template <QuantizationType quantization_type>
2288 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
2289                       quantization_type,
2290                       DepthwiseConvDepthMultiplication::kUnitInputDepth,
2291                       /*max_padding=*/0> {
2292   static inline void PackMacroBlockIntrinsics(
2293       int32 height_block_number, int32 width_block_number,
2294       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2295           input_block_data,
2296       int8* scratch_block_data,
2297       const DepthwiseConvDotProdParams* function_params) {
2298     const int workspace_height_stride =
2299         function_params->workspace_height_stride;
2300     const int width_overall_micro_repeats =
2301         function_params->input_width_overall_micro_repeats;
2302     const int input_width_micro_repeats =
2303         function_params->input_width_micro_repeats;
2304     const int block_height = function_params->inbound_block_height;
2305     const int residual_width = function_params->residual_width;
2306     const int input_height_stride = function_params->input_height_stride;
2307 
2308     TFLITE_DCHECK_EQ(function_params->padding_left, 0);
2309     TFLITE_DCHECK_EQ(function_params->padding_right, 0);
2310     TFLITE_DCHECK_EQ(function_params->padding_top, 0);
2311     TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
2312 
2313     TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
2314 
2315     // Work through one slice, by row, at a time.
2316     int8* scratch_data_base = scratch_block_data;
2317 
2318     const int copy_block_height = block_height;
2319 
2320     int adjusted_residual_width =
2321         input_width_micro_repeats < width_overall_micro_repeats ? residual_width
2322                                                                 : 4;
2323 
2324     const int copy_size =
2325         (width_overall_micro_repeats - 1) * 4 + adjusted_residual_width;
2326 
2327     TFLITE_DCHECK_LE(
2328         copy_size,
2329         input_height_stride - width_block_number * input_width_micro_repeats);
2330     // We may drop up to stride-1 of trailing input.
2331     TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
2332 
2333     int scratch_data_offset = 0;
2334     int input_block_offset = 0;
2335 
2336     constexpr uint8 kSignBit =
2337         QuantizationTypeImpl<quantization_type>::kUint8SignBit;
2338 
2339     // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
2340     // code. Note the blocks of 4x4 are still interleaved down the depth.
2341     int8x16_t work_reg;
2342     int8x8_t half_work_reg;
2343 
2344     // Effect subtraction of zero-point = 128 by XOR of sign bit.
2345     const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
2346     half_work_reg = vdup_n_s8(0);
2347 
2348     if (copy_size >= 16) {
2349       const int copy_remaining = copy_size & 0x7;
2350 
2351       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2352         // Work through one slice, by row, at a time.
2353         int8* scratch_data = scratch_data_base + scratch_data_offset;
2354 
2355         int copy_done = 0;
2356 
2357         // Main copy loop.
2358         for (; (copy_done + 16) <= copy_size; copy_done += 16) {
2359           work_reg =
2360               util_vld1q_x8(input_block_data + input_block_offset + copy_done);
2361           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2362             work_reg = veorq_s8(work_reg, sign_bit);
2363           }
2364           TFLITE_DCHECK_EQ(copy_done % 16, 0);
2365           vst1q_s8(scratch_data + copy_done, work_reg);
2366         }
2367 
2368         if (copy_done + 8 <= copy_size) {
2369           half_work_reg =
2370               util_vld1_x8(input_block_data + input_block_offset + copy_done);
2371           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2372             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2373           }
2374           TFLITE_DCHECK_EQ(copy_done % 8, 0);
2375           vst1_s8(scratch_data + copy_done, half_work_reg);
2376           copy_done += 8;
2377         }
2378 
2379         TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2380         // Total amount
2381         // = copy_size - copy_done + 4 - adjusted_residual_width
2382         // = width_overall_micro_repeats * 4 - start_width - copy_done.
2383         // Undone micro blocks
2384         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2385 
2386         // Conditional is (copy_remaining > 0 || trailing_width_padding).
2387         if (copy_done < copy_size) {
2388           // Employ overlapping-load strategy in order to load full register,
2389           // but use only part.
2390           // This has the advantage of resulting in zeros after shifting.
2391           half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
2392                                        copy_size - 8);
2393 
2394           half_work_reg = vreinterpret_s8_s64(
2395               vshl_s64(vreinterpret_s64_s8(half_work_reg),
2396                        vdup_n_s64(-8 * (8 - copy_remaining))));
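          // The load above covers the last 8 bytes of the row, so the
          // copy_remaining still-uncopied bytes sit in the high lanes; the
          // right shift by 8 * (8 - copy_remaining) bits moves them down to
          // lanes 0..copy_remaining-1 before the store.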
2397 
2398           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2399             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2400           }
2401           TFLITE_DCHECK_EQ(copy_done % 8, 0);
2402           vst1_s8(scratch_data + copy_done, half_work_reg);
2403           copy_done += 8;
2404         }
2405 
2406         // Trailing guard.
2407         vst1_s8(scratch_data + copy_done, half_work_reg);
2408         vst1_s8(scratch_data + copy_done + 8, half_work_reg);
2409 
2410         scratch_data_offset += workspace_height_stride;
2411         input_block_offset += input_height_stride;
2412       }
2413     } else if (copy_size >= 4) {
2414       const int copy_remaining = copy_size & 0x3;
2415 
2416       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2417         // Work through one slice, by row, at a time.
2418         int8* scratch_data = scratch_data_base + scratch_data_offset;
2419 
2420         int copy_done = 0;
2421 
2422         // Main copy loop.
2423         for (; (copy_done + 4) <= copy_size; copy_done += 4) {
2424           half_work_reg =
2425               vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
2426                             half_work_reg, 0);
2427           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2428             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2429           }
2430           TFLITE_DCHECK_EQ(copy_done % 4, 0);
2431           vst1_lane_s8x4(scratch_data + copy_done, half_work_reg, 0);
2432         }
2433 
2434         TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2435         // Total amount
2436         // = copy_size - copy_done + 4 - adjusted_residual_width
2437         // = width_overall_micro_repeats * 4 - start_width - copy_done.
2438         // Undone micro blocks
2439         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2440 
2441         // Conditional is (copy_remaining > 0 || trailing_width_padding).
2442         if (copy_done < copy_size) {
2443           TFLITE_DCHECK_LT(copy_remaining, 4);
2444           // Employ overlapping-load strategy in order to load full register,
2445           // but use only part.
2446           // This has the advantage of resulting in zeros after shifting.
2447           half_work_reg = vld1_lane_8x4(
2448               input_block_data + input_block_offset + copy_size - 4,
2449               half_work_reg, 0);
2450 
2451           half_work_reg = vreinterpret_s8_s64(
2452               vshl_s64(vreinterpret_s64_s8(half_work_reg),
2453                        vdup_n_s64(-8 * (4 - copy_remaining))));
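          // As in the 16-byte path: the load covers the last 4 bytes of the
          // row and the shift moves the valid bytes down into the low lanes.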
2454 
2455           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2456             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2457           }
2458           TFLITE_DCHECK_EQ(copy_done % 4, 0);
2459           vst1_lane_s8x4(scratch_data + copy_done, half_work_reg, 0);
2460           copy_done += 4;
2461         }
2462         // Trailing guard.
2463         vst1_lane_s8x4(scratch_data + copy_done, half_work_reg, 0);
2464         vst1_lane_s8x4(scratch_data + copy_done + 4, half_work_reg, 0);
2465         vst1_lane_s8x4(scratch_data + copy_done + 8, half_work_reg, 0);
2466         vst1_lane_s8x4(scratch_data + copy_done + 12, half_work_reg, 0);
2467 
2468         scratch_data_offset += workspace_height_stride;
2469         input_block_offset += input_height_stride;
2470       }
2471     } else {
2472       TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
2473 
2474       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2475         for (int i = 0; i < copy_size; ++i) {
2476           half_work_reg = vreinterpret_s8_s64(
2477               vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2478           half_work_reg = vld1_lane_s8(
2479               reinterpret_cast<const int8*>(
2480                   input_block_data + input_block_offset + copy_size - 1 - i),
2481               half_work_reg, 0);
2482         }
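        // The loop above inserts the row's bytes from last to first, shifting
        // the register up one byte each time, so byte j of the row ends up in
        // lane j.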
2483 
2484         if (quantization_type == QuantizationType::kNonPerChannelUint8) {
          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
        }
2485         TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
2486         vst1_lane_s8x4(scratch_data_base + scratch_data_offset, half_work_reg,
2487                        0);
2488 
2489         // Trailing guard.
2490         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 4,
2491                        half_work_reg, 0);
2492         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 8,
2493                        half_work_reg, 0);
2494         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 12,
2495                        half_work_reg, 0);
2496         vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 16,
2497                        half_work_reg, 0);
2498 
2499         scratch_data_offset += workspace_height_stride;
2500         input_block_offset += input_height_stride;
2501       }
2502     }
2503 
2504     scratch_data_base += copy_block_height * workspace_height_stride;
2505 
2506     TFLITE_DCHECK_EQ(
2507         scratch_data_base,
2508         scratch_block_data + block_height * workspace_height_stride);
2509   }
2510 
2511   static inline void Run(
2512       int32 height_block_number, int32 width_block_number,
2513       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2514           input_block_data,
2515       int8* scratch_block_data,
2516       const DepthwiseConvDotProdParams* function_params) {
2517 #ifdef __aarch64__
2518     PreloadInputBlock(input_block_data, function_params);
2519 #endif
2520 
2521     PackMacroBlockIntrinsics(height_block_number, width_block_number,
2522                              input_block_data, scratch_block_data,
2523                              function_params);
2524   }
2525 };
2526 
2527 #endif  // ARM NEON
2528 
2529 // Apply filter to macro block of input data and store results.
2530 //
2531 // Requirement: depth_micro_repeats > 0 || residual_depth > 0.
2532 template <int32 stride, QuantizationType quantization_type>
2533 struct KernelMacroBlock<
2534     DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
2535     DepthwiseConvDepthMultiplication::kNoMultiplication, stride> {
2536   // Construct a width-shifted combination of two input sub-blocks, effectively
2537   // concatenating them.
2538   //
2539   // The filter is applied using sub-blocks. These are in the needed form for
2540   // the first (width) offset. For subsequent offsets, the filter is applied to
2541   // shifted and combined data. The concatenation and shifting herein are fairly
2542   // straightforward, but in the optimized code remain an area of creativity in
2543   // design because NEON instructions do not directly support the required
2544   // between-register permutation.
2545   //
2546   // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
2547   // move along the width for each output point calculation, data is shifted, in
2548   // essence between two such blocks.
2549   //
2550   // selected_data has format height 3, depth 4, width 4.
2551   //
2552   // When the micro block is trailing (the last across the macro-block width),
2553   // it would be illegal to load the right (next) block, and the no_right_block
2554   // flag indicates this scenario.
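  //
  // For example, with offset == 2 each selected row becomes
  //   {left[2], left[3], right[0], right[1]}
  // for every depth channel.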
2555   static inline void ConcatenateInputSubBlocks(int offset, int sub_block,
2556                                                int workspace_height_stride,
2557                                                int width_micro_stride,
2558                                                bool no_right_block,
2559                                                const int8* input_block,
2560                                                int8 selected_data[3][4][4]) {
2561     TFLITE_DCHECK_GE(offset, 0);
2562     TFLITE_DCHECK_LT(offset, 4);
2563 
2564     // The input banks have same format as selected_data.
2565     int8 left_bank[3][4][4];
2566     int8 right_bank[3][4][4];
2567 
2568     // Work through one slice, by row, at a time.
2569     for (int k_height = 0; k_height < 3; ++k_height) {
2570       // Simulate demangling of mangled storage arrangement.
2571       const int8* left_input_block =
2572           &input_block[k_height * workspace_height_stride + sub_block * 2 * 8];
2573       memcpy(left_bank[k_height][0], left_input_block, 16);
2574       if (no_right_block) {
2575         memset(right_bank[k_height][0], 0, 16);
2576       } else {
2577         const int8* right_input_block =
2578             &input_block[k_height * workspace_height_stride +
2579                          sub_block * 2 * 8 + width_micro_stride];
2580         memcpy(right_bank[k_height][0], right_input_block, 16);
2581       }
2582       for (int depth_index = 0; depth_index < 4; ++depth_index) {
2583         memcpy(selected_data[k_height][depth_index],
2584                &left_bank[k_height][depth_index][offset], 4 - offset);
2585         memcpy(&selected_data[k_height][depth_index][4 - offset],
2586                right_bank[k_height][depth_index], offset);
2587       }
2588     }
2589   }
2590 
2591   // Straight implementation of 3x3 filter within sub-micro block.
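  // The shuffled filter rows are stored 4 wide with the unused fourth tap
  // zeroed, so the inner width loop can safely run over all 4 lanes.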
2592   static inline void Calculate3x3FilterOutput(
2593       const DepthwiseConvDotProdParams& params, int sub_block,
2594       const int8 selected_data[3][4][4], const int8 filter_bank[3][2][4][4],
2595       const int32* bias_data, uint8 output_values[4]) {
2596     const int32 output_activation_min = params.quantized_activation_min;
2597     const int32 output_activation_max = params.quantized_activation_max;
2598     const int32 output_multiplier = params.output_multiplier;
2599     const int32 output_shift = params.output_shift;
2600     const int32 output_offset = params.output_offset;
2601     for (int d = 0; d < 4; ++d) {
2602       int32 acc = 0;
2603       for (int y = 0; y < 3; ++y) {
2604         for (int x = 0; x < 4; ++x) {
2605           int32 input_val = selected_data[y][d][x];
2606           int32 filter_val = filter_bank[y][sub_block][d][x];
2607           acc += filter_val * input_val;
2608         }
2609       }
2610       acc += bias_data[d];
2611       acc = reference_ops::depthwise_conv::DepthwiseConvRound<
2612           DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
2613                                                 output_shift);
2614       acc += output_offset;
2615       acc = std::max(acc, output_activation_min);
2616       acc = std::min(acc, output_activation_max);
2617       output_values[d] = static_cast<uint8>(acc);
2618     }
2619   }
2620 
2621   static inline void Run(const int8* scratch_block_data,
2622                          const int8* filter_workspace, const int32* bias_data,
2623                          uint8* output_block_data,
2624                          const DepthwiseConvDotProdParams* function_params) {
2625     const int workspace_height_stride =
2626         function_params->workspace_height_stride;
2627     const int input_width_overall_micro_repeats =
2628         function_params->input_width_overall_micro_repeats;
2629     const int output_width_micro_repeats =
2630         function_params->output_width_micro_repeats;
2631     const int depth_micro_repeats = function_params->depth_micro_repeats;
2632     const int depth = function_params->input_depth;
2633     const int stride_val = function_params->stride;
2634     const int four_over_stride = function_params->four_over_stride;
2635 
2636     const int output_width_overall_micro_repeats =
2637         function_params->output_width_overall_micro_repeats;
2638     const int block_height = function_params->outbound_block_height;
2639     const int residual_width = function_params->output_residual_width;
2640     const int output_height_stride = function_params->output_height_stride;
2641     constexpr int bias_increment = 4;
2642     TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
2643 
2644     TFLITE_DCHECK(depth_micro_repeats > 0);
2645     const int width_micro_stride = 4 * 8;
2646     const int depth_micro_stride =
2647         width_micro_stride * input_width_overall_micro_repeats;
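    // The workspace stores, for each group of 8 channels, all width micro
    // blocks (4 pixels x 8 channels = 32 bytes each) before moving on to the
    // next 8 channels, hence depth_micro_stride.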
2648 
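    // 2 sub-blocks x 3 filter rows x depth 4 x width 4 bytes per micro block
    // of shuffled filter data.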
2649     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
2650 
2651     // Simulate NEON-register transposition of subset of filter.
2652     int8 filter_bank[3][2][4][4];  // Height 3, sub-block, depth 4, width 4.
2653     // Simulate NEON-register input data concatenation + sub-selection.
2654     int8 sub_selected_input_data[3][4][4];  // Height 3, depth 4, width 4.
2655     uint8 output_values[4];                 // Depth 4.
2656 
2657     // The outer 3 loops go through all the micro blocks in a macro block, and
2658     // separately treat the two sub-blocks within each micro block.
2659     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2660       memcpy(filter_bank[0][0][0],
2661              filter_workspace + j_depth * shuffled_filter_increment,
2662              shuffled_filter_increment);
2663 
2664       for (int s = 0; s < 2; ++s) {
2665         for (int k_height = 0; k_height < block_height; ++k_height) {
2666           const int8* scratch_data =
2667               scratch_block_data +
2668               workspace_height_stride * k_height * stride_val +
2669               depth_micro_stride * j_depth;
2670           uint8* output_data =
2671               output_block_data + output_height_stride * k_height + 8 * j_depth;
2672 
2673           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2674                ++i_width) {
2675             const int output_width = i_width == output_width_micro_repeats
2676                                          ? residual_width
2677                                          : four_over_stride;
2678             const bool no_right_block = (output_width - 1) * stride_val < 2;
2679             TFLITE_DCHECK_LE(output_width * stride_val, 4);
2680             const int8* input_data =
2681                 scratch_data + width_micro_stride * i_width;
2682             // Iterate over input width shifts within sub-micro blocks.
2683             for (int x = 0; x < output_width; ++x) {
2684               ConcatenateInputSubBlocks(x * stride_val, s,
2685                                         workspace_height_stride,
2686                                         width_micro_stride, no_right_block,
2687                                         input_data, sub_selected_input_data);
2688               Calculate3x3FilterOutput(
2689                   *function_params, s, sub_selected_input_data, filter_bank,
2690                   bias_data + (2 * j_depth + s) * bias_increment,
2691                   output_values);
2692               for (int d = 0; d < 4; ++d) {
2693                 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
2694                             d] = output_values[d];
2695               }
2696             }
2697           }
2698         }
2699       }
2700     }
2701   }
2702 };
2703 
2704 // Apply filter to macro block of input data and store results.
2705 //
2706 // Parameters for repeats and residual sizes are in terms of outputs.
2707 //
2708 // Requirement: depth_micro_repeats > 0 || residual_depth > 0.
2709 template <int32 stride, QuantizationType quantization_type>
2710 struct KernelMacroBlock<
2711     DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
2712     DepthwiseConvDepthMultiplication::kUnitInputDepth, stride> {
2713   // Construct a width-shifted combination of two input sub-blocks, effectively
2714   // concatenating them.
2715   //
2716   // The filter is applied using sub-blocks. These are in the needed form for
2717   // the first (width) offset. For subsequent offsets, the filter is applied to
2718   // shifted and combined data. The concatenation and shifting herein are fairly
2719   // straightforward, but in the optimized code remain an area of creativity in
2720   // design because NEON instructions do not directly support the required
2721   // between-register permutation.
2722   //
2723   // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
2724   // move along the width for each output point calculation, data is shifted, in
2725   // essence between two such blocks.
2726   //
2727   // selected_data has format height 3, width 4.
2728   //
2729   // When the micro block is trailing (the last across the macro-block width),
2730   // it would be illegal to load the right (next) block, and the no_right_block
2731   // flag indicates this scenario.
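  //
  // For example, with offset == 2 each selected row is simply the 4 bytes of
  // that input row starting at position 2 (truncated to 4 - offset bytes when
  // no_right_block is set).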
2732   static inline void ConcatenateInputSubBlocks(int offset,
2733                                                int workspace_height_stride,
2734                                                bool no_right_block,
2735                                                const int8* input_block,
2736                                                int8 selected_data[3][4]) {
2737     TFLITE_DCHECK_GE(offset, 0);
2738     TFLITE_DCHECK_LT(offset, 4);
2739     if (no_right_block) {
2740       for (int k_height = 0; k_height < 3; ++k_height) {
2741         memcpy(selected_data[k_height],
2742                &input_block[k_height * workspace_height_stride + offset],
2743                4 - offset);
2744       }
2745     } else {
2746       for (int k_height = 0; k_height < 3; ++k_height) {
2747         memcpy(selected_data[k_height],
2748                &input_block[k_height * workspace_height_stride + offset], 4);
2749       }
2750     }
2751   }
2752 
2753   // Straight implementation of 3x3 filter within sub-micro block.
2754   static inline void Calculate3x3FilterOutput(
2755       const DepthwiseConvDotProdParams& function_params, int sub_block,
2756       const int8 selected_data[3][4], const int8 filter_bank[3][2][4][4],
2757       const int32* bias_data, uint8 output_values[4]) {
2758     const int32 output_activation_min =
2759         function_params.quantized_activation_min;
2760     const int32 output_activation_max =
2761         function_params.quantized_activation_max;
2762     const int32 output_multiplier = function_params.output_multiplier;
2763     const int32 output_shift = function_params.output_shift;
2764     const int32 output_offset = function_params.output_offset;
2765     for (int d = 0; d < 4; ++d) {
2766       int32 acc = 0;
2767       for (int y = 0; y < 3; ++y) {
2768         for (int x = 0; x < 4; ++x) {
2769           int32 input_val = selected_data[y][x];
2770           int32 filter_val = filter_bank[y][sub_block][d][x];
2771           acc += filter_val * input_val;
2772         }
2773       }
2774       acc += bias_data[d];
2775       acc = reference_ops::depthwise_conv::DepthwiseConvRound<
2776           DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
2777                                                 output_shift);
2778       acc += output_offset;
2779       acc = std::max(acc, output_activation_min);
2780       acc = std::min(acc, output_activation_max);
2781       output_values[d] = static_cast<uint8>(acc);
2782     }
2783   }
2784 
2785   static inline void Run(const int8* scratch_block_data,
2786                          const int8* filter_workspace, const int32* bias_data,
2787                          uint8* output_block_data,
2788                          const DepthwiseConvDotProdParams* function_params) {
2789     const int workspace_height_stride =
2790         function_params->workspace_height_stride;
2791     const int output_width_micro_repeats =
2792         function_params->output_width_micro_repeats;
2793     const int depth_micro_repeats = function_params->depth_micro_repeats;
2794     const int depth = function_params->output_depth;
2795     const int stride_val = function_params->stride;
2796     const int four_over_stride = function_params->four_over_stride;
2797 
2798     const int workspace_width_micro_repeats =
2799         function_params->workspace_width_micro_repeats;
2800     const int output_width_overall_micro_repeats =
2801         function_params->output_width_overall_micro_repeats;
2802     const int block_height = function_params->outbound_block_height;
2803     const int residual_width = function_params->output_residual_width;
2804     const int output_height_stride = function_params->output_height_stride;
2805     constexpr int bias_increment = 4;
2806     TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
2807 
2808     TFLITE_DCHECK(depth_micro_repeats > 0);
2809 
2810     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
2811 
2812     // Simulate NEON-register transposition of subset of filter.
2813     int8 filter_bank[3][2][4][4];  // Height 3, sub-block, depth 4, width 4.
2814     // Simulate NEON-register input data concatenation + sub-selection.
2815     int8 sub_selected_input_data[3][4];  // Height 3, width 4.
2816     uint8 output_values[4];              // Depth 4.
2817 
2818     // The outer 3 loops go through all the micro blocks in a macro block, and
2819     // separately treat the two sub-blocks within each micro block.
2820     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2821       memcpy(filter_bank[0][0][0],
2822              filter_workspace + j_depth * shuffled_filter_increment,
2823              shuffled_filter_increment);
2824 
2825       for (int s = 0; s < 2; ++s) {
2826         for (int k_height = 0; k_height < block_height; ++k_height) {
2827           const int8* scratch_data =
2828               scratch_block_data +
2829               workspace_height_stride * k_height * stride_val;
2830           uint8* output_data =
2831               output_block_data + output_height_stride * k_height + 8 * j_depth;
2832 
2833           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2834                ++i_width) {
2835             const int output_width = i_width == output_width_micro_repeats
2836                                          ? residual_width
2837                                          : four_over_stride;
2838             const bool no_right_block = i_width == output_width_micro_repeats &&
2839                                         output_width_overall_micro_repeats ==
2840                                             workspace_width_micro_repeats;
2841             TFLITE_DCHECK_LE(output_width * stride_val, 4);
2842             const int8* input_data = scratch_data + 4 * i_width;
2843             // Iterate over input width shifts within 4x4 blocks.
2844             for (int x = 0; x < output_width; ++x) {
2845               ConcatenateInputSubBlocks(x * stride_val, workspace_height_stride,
2846                                         no_right_block, input_data,
2847                                         sub_selected_input_data);
2848               Calculate3x3FilterOutput(
2849                   *function_params, s, sub_selected_input_data, filter_bank,
2850                   bias_data + (2 * j_depth + s) * bias_increment,
2851                   output_values);
2852               for (int d = 0; d < 4; ++d) {
2853                 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
2854                             d] = output_values[d];
2855               }
2856             }
2857           }
2858         }
2859       }
2860     }
2861   }
2862 };
2863 
2864 // Beginning of code section containing intermediate code transformation.
2865 //
2866 // This section is only compiled when kUseUnwound3x3DotProduct versions of
2867 // templated functions are selected.
2868 template <int32 stride, QuantizationType quantization_type>
2869 struct KernelMacroBlock<
2870     DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
2871     DepthwiseConvDepthMultiplication::kNoMultiplication, stride> {
2872   static inline void Run(const int8* scratch_block_data,
2873                          const int8* filter_workspace, const int32* bias_data,
2874                          uint8* output_block_data,
2875                          const DepthwiseConvDotProdParams* function_params) {
2876     const int workspace_height_stride =
2877         function_params->workspace_height_stride;
2878     const int input_width_overall_micro_repeats =
2879         function_params->input_width_overall_micro_repeats;
2880     const int output_width_micro_repeats =
2881         function_params->output_width_micro_repeats;
2882     const int depth_micro_repeats = function_params->depth_micro_repeats;
2883     const int depth = function_params->input_depth;
2884     const int stride_val = function_params->stride;
2885     const int four_over_stride = function_params->four_over_stride;
2886 
2887     const int output_width_overall_micro_repeats =
2888         function_params->output_width_overall_micro_repeats;
2889     const int block_height = function_params->outbound_block_height;
2890     const int residual_width = function_params->output_residual_width;
2891     const int output_height_stride = function_params->output_height_stride;
2892     const int bias_increment = function_params->bias_increment;
2893 
2894     TFLITE_DCHECK(depth_micro_repeats > 0);
2895     const int width_micro_stride = 4 * 8;
2896     const int depth_micro_stride =
2897         width_micro_stride * input_width_overall_micro_repeats;
2898 
2899     const int32 output_activation_min =
2900         function_params->quantized_activation_min;
2901     const int32 output_activation_max =
2902         function_params->quantized_activation_max;
2903     const int32 output_multiplier = function_params->output_multiplier;
2904     const int32 output_shift = function_params->output_shift;
2905     const int32 output_offset = function_params->output_offset;
2906 
2907     // Simulate NEON-register transposition of subset of filter.
2908     int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
2909     int8 filter_bank_a_1[4][4];
2910     int8 filter_bank_a_2[4][4];
2911     int8 filter_bank_b_0[4][4];
2912     int8 filter_bank_b_1[4][4];
2913     int8 filter_bank_b_2[4][4];
2914     // Simulate NEON-register input data concatenation + sub-selection: the
2915     // left/right banks below together cover height 3, depth 4, width 4.
2916     uint8 output_values[4];  // Depth 4.
2917     // Each bank has format depth 4, width 4.
2918     int8 left_bank_0[4][4];
2919     int8 left_bank_1[4][4];
2920     int8 left_bank_2[4][4];
2921     int8 right_bank_0[4][4];
2922     int8 right_bank_1[4][4];
2923     int8 right_bank_2[4][4];
2924     memset(right_bank_0[0], 0, 16);
2925     memset(right_bank_1[0], 0, 16);
2926     memset(right_bank_2[0], 0, 16);
2927 
2928     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
2929 
2930     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2931       const int8* filter_block =
2932           filter_workspace + shuffled_filter_increment * j_depth;
2933 
2934       memcpy(filter_bank_a_0, filter_block, 16);
2935       memcpy(filter_bank_b_0, filter_block + 16, 16);
2936       memcpy(filter_bank_a_1, filter_block + 32, 16);
2937       memcpy(filter_bank_b_1, filter_block + 48, 16);
2938       memcpy(filter_bank_a_2, filter_block + 64, 16);
2939       memcpy(filter_bank_b_2, filter_block + 80, 16);
2940 
2941       for (int s = 0; s < 2; ++s) {
2942         // Work through one slice, by row, at a time.
2943         for (int k_height = 0; k_height < block_height; ++k_height) {
2944           const int8* scratch_data =
2945               scratch_block_data +
2946               workspace_height_stride * k_height * stride_val +
2947               depth_micro_stride * j_depth;
2948           uint8* output_data =
2949               output_block_data + output_height_stride * k_height + 8 * j_depth;
2950           const int8* input_data_0 = scratch_data + s * 2 * 8;
2951 
2952           // Load first sub-micro block of data into operational banks.
2953           memcpy(left_bank_0[0], input_data_0, 16);
2954           memcpy(left_bank_1[0], input_data_0 + workspace_height_stride, 16);
2955           memcpy(left_bank_2[0], input_data_0 + 2 * workspace_height_stride,
2956                  16);
2957 
2958           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2959                ++i_width) {
2960             const int output_width = i_width == output_width_micro_repeats
2961                                          ? residual_width
2962                                          : four_over_stride;
2963             TFLITE_DCHECK_LE(output_width * stride_val, 4);
2964             const int8* input_data =
2965                 input_data_0 + width_micro_stride * i_width;
2966             const bool no_right_block = (output_width - 1) * stride_val < 2;
2967 
2968             // Load next sub-micro block of data.
2969             if (!no_right_block) {
2970               memcpy(right_bank_0[0], input_data + width_micro_stride, 16);
2971               memcpy(right_bank_1[0],
2972                      input_data + workspace_height_stride + width_micro_stride,
2973                      16);
2974               memcpy(
2975                   right_bank_2[0],
2976                   input_data + 2 * workspace_height_stride + width_micro_stride,
2977                   16);
2978             }
2979 
2980             // Iterate over input width shifts within 4x4 blocks.
2981             for (int w = 0; w < output_width; ++w) {
2982               // Operate on depth of 4 in batches.
2983               for (int d = 0; d < 4; ++d) {
2984                 int32 acc = 0;
2985                 for (int x = 0; x < 4; ++x) {
2986                   int32 input_val = left_bank_0[d][x];
2987                   int32 filter_val = filter_bank_a_0[d][x];
2988                   acc += filter_val * input_val;
2989                 }
2990                 for (int x = 0; x < 4; ++x) {
2991                   int32 input_val = left_bank_1[d][x];
2992                   int32 filter_val = filter_bank_a_1[d][x];
2993                   acc += filter_val * input_val;
2994                 }
2995                 for (int x = 0; x < 4; ++x) {
2996                   int32 input_val = left_bank_2[d][x];
2997                   int32 filter_val = filter_bank_a_2[d][x];
2998                   acc += filter_val * input_val;
2999                 }
3000                 acc += bias_data[d];
3001                 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
3002                     DepthwiseConvOutputRounding::kUpward>(
3003                     acc, output_multiplier, output_shift);
3004                 acc += output_offset;
3005                 acc = std::max(acc, output_activation_min);
3006                 acc = std::min(acc, output_activation_max);
3007                 output_values[d] = static_cast<uint8>(acc);
3008               }
3009 
3010               for (int d = 0; d < 4; ++d) {
3011                 output_data[depth * (four_over_stride * i_width + w) + 4 * s +
3012                             d] = output_values[d];
3013               }
3014 
3015               // Simulate shifting instructions.
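              // For stride 1 the 4-wide window advances one pixel per output;
              // for stride 2 it advances two, refilling from the right banks.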
3016               if (stride_val == 1) {
3017                 for (int depth_index = 0; depth_index < 4; ++depth_index) {
3018                   for (int z = 0; z < 3; ++z) {
3019                     left_bank_0[depth_index][z] =
3020                         left_bank_0[depth_index][z + 1];
3021                     left_bank_1[depth_index][z] =
3022                         left_bank_1[depth_index][z + 1];
3023                     left_bank_2[depth_index][z] =
3024                         left_bank_2[depth_index][z + 1];
3025                   }
3026                   left_bank_0[depth_index][3] = right_bank_0[depth_index][0];
3027                   left_bank_1[depth_index][3] = right_bank_1[depth_index][0];
3028                   left_bank_2[depth_index][3] = right_bank_2[depth_index][0];
3029                   for (int z = 0; z < 3; ++z) {
3030                     right_bank_0[depth_index][z] =
3031                         right_bank_0[depth_index][z + 1];
3032                     right_bank_1[depth_index][z] =
3033                         right_bank_1[depth_index][z + 1];
3034                     right_bank_2[depth_index][z] =
3035                         right_bank_2[depth_index][z + 1];
3036                   }
3037                 }
3038               } else {
3039                 for (int depth_index = 0; depth_index < 4; ++depth_index) {
3040                   for (int z = 0; z < 2; ++z) {
3041                     left_bank_0[depth_index][z] =
3042                         left_bank_0[depth_index][z + 2];
3043                     left_bank_1[depth_index][z] =
3044                         left_bank_1[depth_index][z + 2];
3045                     left_bank_2[depth_index][z] =
3046                         left_bank_2[depth_index][z + 2];
3047                   }
3048                   left_bank_0[depth_index][2] = right_bank_0[depth_index][0];
3049                   left_bank_1[depth_index][2] = right_bank_1[depth_index][0];
3050                   left_bank_2[depth_index][2] = right_bank_2[depth_index][0];
3051                   left_bank_0[depth_index][3] = right_bank_0[depth_index][1];
3052                   left_bank_1[depth_index][3] = right_bank_1[depth_index][1];
3053                   left_bank_2[depth_index][3] = right_bank_2[depth_index][1];
3054                   for (int z = 0; z < 2; ++z) {
3055                     right_bank_0[depth_index][z] =
3056                         right_bank_0[depth_index][z + 2];
3057                     right_bank_1[depth_index][z] =
3058                         right_bank_1[depth_index][z + 2];
3059                     right_bank_2[depth_index][z] =
3060                         right_bank_2[depth_index][z + 2];
3061                   }
3062                 }
3063               }
3064             }
3065           }
3066         }
3067         bias_data += bias_increment;
3068 
3069         // Move filter for second sub-block into operational filter.
3070         for (int z = 0; z < 4; ++z) {
3071           for (int x = 0; x < 4; ++x) {
3072             filter_bank_a_0[z][x] = filter_bank_b_0[z][x];
3073             filter_bank_a_1[z][x] = filter_bank_b_1[z][x];
3074             filter_bank_a_2[z][x] = filter_bank_b_2[z][x];
3075           }
3076         }
3077       }
3078     }
3079   }
3080 };
3081 
3082 template <int32 stride, QuantizationType quantization_type>
3083 struct KernelMacroBlock<
3084     DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
3085     DepthwiseConvDepthMultiplication::kUnitInputDepth, stride> {
3086   static inline void Run(const int8* scratch_block_data,
3087                          const int8* filter_workspace, const int32* bias_data,
3088                          uint8* output_block_data,
3089                          const DepthwiseConvDotProdParams* function_params) {
3090     const int workspace_height_stride =
3091         function_params->workspace_height_stride;
3092     const int output_width_micro_repeats =
3093         function_params->output_width_micro_repeats;
3094     const int depth_micro_repeats = function_params->depth_micro_repeats;
3095     const int output_depth = function_params->output_depth;
3096     const int stride_val = function_params->stride;
3097     const int four_over_stride = function_params->four_over_stride;
3098 
3099     const int output_width_overall_micro_repeats =
3100         function_params->output_width_overall_micro_repeats;
3101     const int block_height = function_params->outbound_block_height;
3102     const int residual_width = function_params->output_residual_width;
3103     const int output_height_stride = function_params->output_height_stride;
3104     const int bias_increment = function_params->bias_increment;
3105 
3106     const int32 output_activation_min =
3107         function_params->quantized_activation_min;
3108     const int32 output_activation_max =
3109         function_params->quantized_activation_max;
3110     const int32 output_multiplier = function_params->output_multiplier;
3111     const int32 output_shift = function_params->output_shift;
3112     const int32 output_offset = function_params->output_offset;
3113 
3114     TFLITE_DCHECK(depth_micro_repeats > 0);
3115 
3116     TFLITE_DCHECK_EQ(bias_increment, 4);
3117 
3118     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
3119 
3120     // Simulate NEON-register transposition of subset of filter.
3121     int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
3122     int8 filter_bank_a_1[4][4];
3123     int8 filter_bank_a_2[4][4];
3124     int8 filter_bank_b_0[4][4];
3125     int8 filter_bank_b_1[4][4];
3126     int8 filter_bank_b_2[4][4];
3127     // Simulate NEON-register input data concatenation + sub-selection: each
3128     // input bank holds 8 bytes of one of the three input rows.
3129 
3130     int8 input_bank_0[8];
3131     int8 input_bank_1[8];
3132     int8 input_bank_2[8];
3133 
3134     TFLITE_DCHECK_GE(depth_micro_repeats, 1);
3135 
3136     uint8 output_values[2][4];  // Sub-block, depth 4.
3137 
3138     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
3139       memcpy(filter_bank_a_0, filter_workspace, 16);
3140       memcpy(filter_bank_b_0, filter_workspace + 16, 16);
3141       memcpy(filter_bank_a_1, filter_workspace + 32, 16);
3142       memcpy(filter_bank_b_1, filter_workspace + 48, 16);
3143       memcpy(filter_bank_a_2, filter_workspace + 64, 16);
3144       memcpy(filter_bank_b_2, filter_workspace + 80, 16);
3145 
3146       // Work through one slice, by row, at a time.
3147       for (int k_height = 0; k_height < block_height; ++k_height) {
3148         const int8* scratch_data =
3149             scratch_block_data +
3150             workspace_height_stride * k_height * stride_val;
3151         uint8* output_data =
3152             output_block_data + output_height_stride * k_height + 8 * j_depth;
3153 
3154         memcpy(input_bank_0, scratch_data, 4);
3155         memcpy(input_bank_1, scratch_data + workspace_height_stride, 4);
3156         memcpy(input_bank_2, scratch_data + 2 * workspace_height_stride, 4);
3157 
3158         for (int i_width = 0; i_width < output_width_overall_micro_repeats;
3159              ++i_width) {
3160           const int output_width = i_width == output_width_micro_repeats
3161                                        ? residual_width
3162                                        : four_over_stride;
3163 
3164           TFLITE_DCHECK_LE(output_width * stride_val, 4);
3165           const int8* input_data = scratch_data + 4 * i_width;
3166 
3167           memcpy(input_bank_0 + 4, input_data + 4, 4);
3168           memcpy(input_bank_1 + 4, input_data + workspace_height_stride + 4, 4);
3169           memcpy(input_bank_2 + 4, input_data + 2 * workspace_height_stride + 4,
3170                  4);
3171 
3172           // Iterate over input width shifts within 4x4 blocks.
3173           for (int w = 0; w < output_width; ++w) {
3174             // Input banks are shifted each step; reads always use offset 0.
3175             constexpr int offset = 0;
3176 
3177             {
3178               const int s = 0;
3179               for (int d = 0; d < 4; ++d) {
3180                 int32 acc = bias_data[s * 4 + d];
3181                 for (int x = 0; x < 4; ++x) {
3182                   int32 input_val_0 = input_bank_0[offset + x];
3183                   int32 filter_val_0 = filter_bank_a_0[d][x];
3184                   acc += filter_val_0 * input_val_0;
3185                   int32 input_val_1 = input_bank_1[offset + x];
3186                   int32 filter_val_1 = filter_bank_a_1[d][x];
3187                   acc += filter_val_1 * input_val_1;
3188                   int32 input_val_2 = input_bank_2[offset + x];
3189                   int32 filter_val_2 = filter_bank_a_2[d][x];
3190                   acc += filter_val_2 * input_val_2;
3191                 }
3192                 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
3193                     DepthwiseConvOutputRounding::kUpward>(
3194                     acc, output_multiplier, output_shift);
3195                 acc += output_offset;
3196                 acc = std::max(acc, output_activation_min);
3197                 acc = std::min(acc, output_activation_max);
3198                 output_values[s][d] = static_cast<uint8>(acc);
3199 
3200                 output_data[s * 4 + d] = output_values[s][d];
3201               }
3202             }
3203             {
3204               const int s = 1;
3205               for (int d = 0; d < 4; ++d) {
3206                 int32 acc = bias_data[s * 4 + d];
3207                 for (int x = 0; x < 4; ++x) {
3208                   int32 input_val_0 = input_bank_0[offset + x];
3209                   int32 filter_val_0 = filter_bank_b_0[d][x];
3210                   acc += filter_val_0 * input_val_0;
3211                   int32 input_val_1 = input_bank_1[offset + x];
3212                   int32 filter_val_1 = filter_bank_b_1[d][x];
3213                   acc += filter_val_1 * input_val_1;
3214                   int32 input_val_2 = input_bank_2[offset + x];
3215                   int32 filter_val_2 = filter_bank_b_2[d][x];
3216                   acc += filter_val_2 * input_val_2;
3217                 }
3218                 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
3219                     DepthwiseConvOutputRounding::kUpward>(
3220                     acc, output_multiplier, output_shift);
3221                 acc += output_offset;
3222                 acc = std::max(acc, output_activation_min);
3223                 acc = std::min(acc, output_activation_max);
3224                 output_values[s][d] = static_cast<uint8>(acc);
3225 
3226                 output_data[s * 4 + d] = output_values[s][d];
3227               }
3228             }
3229 
3230             // Simulate register shifts.
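            // Advance the 8-byte input banks by stride_val pixels so that the
            // next output position again reads from offset 0.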
3231             for (int i = 0; i < (8 - stride_val); ++i) {
3232               input_bank_0[i] = input_bank_0[i + stride_val];
3233               input_bank_1[i] = input_bank_1[i + stride_val];
3234               input_bank_2[i] = input_bank_2[i + stride_val];
3235             }
3236 
3237             output_data += output_depth;
3238           }
3239         }
3240       }
3241       bias_data += 2 * bias_increment;
3242       filter_workspace += shuffled_filter_increment;
3243     }
3244   }
3245 };
3246 // The preceding section is only compiled when kUseUnwound3x3DotProduct versions
3247 // of templated functions are selected.
3248 //
3249 // End of code section containing intermediate code transformation.
3250 
3251 #ifdef USE_NEON
3252 template <>
3253 struct KernelMacroBlock<
3254     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
3255     QuantizationType::kNonPerChannelUint8,
3256     DepthwiseConvDepthMultiplication::kNoMultiplication,
3257     /*stride=*/1> {
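  // Thin wrappers selecting the unsigned saturating-narrow and min/max
  // variants appropriate for uint8 output in this specialization.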
3258   static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
3259   static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
3260     return vmin_u8(a, b);
3261   }
3262   static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
3263     return vmax_u8(a, b);
3264   }
3265   static inline uint8x16_t util_vminq_x8(uint8x16_t a, uint8x16_t b) {
3266     return vminq_u8(a, b);
3267   }
3268   static inline uint8x16_t util_vmaxq_x8(uint8x16_t a, uint8x16_t b) {
3269     return vmaxq_u8(a, b);
3270   }
3271 
3272   static inline void KernelMacroBlockIntrinsics(
3273       const int8* scratch_block_data, const int8* filter_workspace,
3274       const int32* bias_data, uint8* output_block_data,
3275       const DepthwiseConvDotProdParams* function_params) {
3276     static constexpr QuantizationType quantization_type =
3277         QuantizationType::kNonPerChannelUint8;
3278 
3279     const int workspace_height_stride =
3280         function_params->workspace_height_stride;
3281     const int input_width_overall_micro_repeats =
3282         function_params->input_width_overall_micro_repeats;
3283     const int output_width_micro_repeats =
3284         function_params->output_width_micro_repeats;
3285     const int depth_micro_repeats = function_params->depth_micro_repeats;
3286     const int depth = function_params->input_depth;
3287 
3288     const int output_width_overall_micro_repeats =
3289         function_params->output_width_overall_micro_repeats;
3290     const int block_height = function_params->outbound_block_height;
3291     const int residual_width = function_params->output_residual_width;
3292     const int output_height_stride = function_params->output_height_stride;
3293     constexpr int kBiasIncrement = 4;
3294 
3295     TFLITE_DCHECK(depth_micro_repeats > 0);
3296     const int width_micro_stride = 4 * 8;
3297     const int depth_micro_stride =
3298         width_micro_stride * input_width_overall_micro_repeats;
3299 
3300     const int32 output_activation_min =
3301         function_params->quantized_activation_min;
3302     const int32 output_activation_max =
3303         function_params->quantized_activation_max;
3304     const int32 output_multiplier = function_params->output_multiplier;
3305     const int32 output_shift = function_params->output_shift;
3306     const int32 output_offset = function_params->output_offset;
3307     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
3308       TFLITE_DCHECK_GE(output_activation_min, 0);
3309       TFLITE_DCHECK_LT(output_activation_min, 256);
3310       TFLITE_DCHECK_GE(output_activation_max, 0);
3311       TFLITE_DCHECK_LT(output_activation_max, 256);
3312     } else {
3313       TFLITE_DCHECK_GE(output_activation_min, -128);
3314       TFLITE_DCHECK_LT(output_activation_min, 128);
3315       TFLITE_DCHECK_GE(output_activation_max, -128);
3316       TFLITE_DCHECK_LT(output_activation_max, 128);
3317     }
3318     TFLITE_DCHECK_GE(output_offset, -32768);
3319     TFLITE_DCHECK_LT(output_offset, 32768);
3320 
3321     const int16x8_t output_offset_vec =
3322         vdupq_n_s16(static_cast<int16>(output_offset));
3323     const uint8x16_t output_activation_min_vec =
3324         vdupq_n_u8(static_cast<uint8>(output_activation_min));
3325     const uint8x16_t output_activation_max_vec =
3326         vdupq_n_u8(static_cast<uint8>(output_activation_max));
3327 
3328     const int8* input_data_depthwise = scratch_block_data;
3329     typename QuantizationTypeImpl<quantization_type>::ExternalType*
3330         output_data_depthwise = output_block_data;
3331     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
3332       // Simulate NEON-register transposition of subset of filter.
3333       int8x16_t filter_reg_0_a;
3334       int8x16_t filter_reg_0_b;
3335       int8x16_t filter_reg_1_a;
3336       int8x16_t filter_reg_1_b;
3337       int8x16_t filter_reg_2_a;
3338       int8x16_t filter_reg_2_b;
3339       int8x16_t filter_reg_0_a_shifted;
3340       int8x16_t filter_reg_1_a_shifted;
3341       int8x16_t filter_reg_2_a_shifted;
3342 
3343       filter_reg_0_a = vld1q_s8(filter_workspace);
3344       filter_workspace += 16;
3345       filter_reg_0_b = vld1q_s8(filter_workspace);
3346       filter_workspace += 16;
3347       filter_reg_1_a = vld1q_s8(filter_workspace);
3348       filter_workspace += 16;
3349       filter_reg_1_b = vld1q_s8(filter_workspace);
3350       filter_workspace += 16;
3351       filter_reg_2_a = vld1q_s8(filter_workspace);
3352       filter_workspace += 16;
3353       filter_reg_2_b = vld1q_s8(filter_workspace);
3354       filter_workspace += 16;
3355 
3356       filter_reg_0_a_shifted = vreinterpretq_s8_u32(
3357           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
3358       filter_reg_1_a_shifted = vreinterpretq_s8_u32(
3359           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
3360       filter_reg_2_a_shifted = vreinterpretq_s8_u32(
3361           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
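      // Within each 32-bit lane the filter taps move up one byte, so dotting
      // the shifted filters against the same input banks yields the output
      // one pixel to the right.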
3362 
3363       if (block_height == 4) {
3364         for (int s = 0; s < 2; ++s) {
3365           // Work through one slice, by row, at a time.
3366           const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
3367           typename QuantizationTypeImpl<quantization_type>::ExternalType*
3368               output_data_base = output_data_depthwise + 4 * s;
3369 
3370           const int8* next_input_data = input_data_base;
3371           typename QuantizationTypeImpl<quantization_type>::ExternalType*
3372               output_data = output_data_base;
3373 
3374           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
3375           bias_data += kBiasIncrement;
3376 
3377           // Load first sub-micro block of data into operational banks.
3378           int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
3379           int8x16_t left_bank_1_reg =
3380               vld1q_s8(next_input_data + workspace_height_stride);
3381           int8x16_t left_bank_2_reg =
3382               vld1q_s8(next_input_data + 2 * workspace_height_stride);
3383           int8x16_t left_bank_3_reg =
3384               vld1q_s8(next_input_data + 3 * workspace_height_stride);
3385           int8x16_t left_bank_4_reg =
3386               vld1q_s8(next_input_data + 4 * workspace_height_stride);
3387           int8x16_t left_bank_5_reg =
3388               vld1q_s8(next_input_data + 5 * workspace_height_stride);
3389 
3390           int32x4_t acc0;
3391           int32x4_t acc1;
3392           int32x4_t acc2;
3393           int32x4_t acc3;
3394 
3395           acc0 = adjusted_bias_data;
3396           acc1 = adjusted_bias_data;
3397           acc2 = adjusted_bias_data;
3398           acc3 = adjusted_bias_data;
3399 
3400           acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3401           acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3402           acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3403           acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
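          // Pre-compute the dot products that are already possible with the
          // rows loaded above; the remaining ones are added at the top of the
          // width loop (software pipelining).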
3404 
3405           for (int i_width = 0; i_width < output_width_micro_repeats;
3406                ++i_width) {
3407             next_input_data += width_micro_stride;
3408 
3409             // Iterate over input width shifts within 4x4 blocks.
3410             {
3411               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3412               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3413               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3414               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3415               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3416               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3417               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3418               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3419 
3420               // Fixed-point multiplication.
3421               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3422               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3423                   acc0, -output_shift);
3424               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3425               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3426                   acc1, -output_shift);
3427               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3428               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3429                   acc2, -output_shift);
3430               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3431               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3432                   acc3, -output_shift);
3433               // Add the output offset.
3434               int16x8_t acc_s16_0_1 =
3435                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3436               int16x8_t acc_s16_2_3 =
3437                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3438               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3439               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3440               // Apply the activation function.
3441               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3442                                                   vqmovxn_s16(acc_s16_2_3));
3443               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3444               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3445 
3446               vst1q_lane_u8x4(output_data, acc_u8_all, 0);
3447               vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
3448                               1);
3449               vst1q_lane_u8x4(output_data + 2 * output_height_stride,
3450                               acc_u8_all, 2);
3451               vst1q_lane_u8x4(output_data + 3 * output_height_stride,
3452                               acc_u8_all, 3);
3453 
3454               output_data += depth;
3455             }
3456 
3457             // Load next sub-micro block of data.
3458             int8x16_t right_bank_0_reg;
3459             int8x16_t right_bank_1_reg;
3460             int8x16_t right_bank_2_reg;
3461             int8x16_t right_bank_3_reg;
3462             int8x16_t right_bank_4_reg;
3463             int8x16_t right_bank_5_reg;
3464 
3465             // Loading of next block always valid.
3466             right_bank_0_reg = vld1q_s8(next_input_data);
3467             right_bank_1_reg =
3468                 vld1q_s8(next_input_data + workspace_height_stride);
3469             right_bank_2_reg =
3470                 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3471             right_bank_3_reg =
3472                 vld1q_s8(next_input_data + 3 * workspace_height_stride);
3473             right_bank_4_reg =
3474                 vld1q_s8(next_input_data + 4 * workspace_height_stride);
3475             right_bank_5_reg =
3476                 vld1q_s8(next_input_data + 5 * workspace_height_stride);
3477 
3478             {
3479               acc0 = adjusted_bias_data;
3480               acc1 = adjusted_bias_data;
3481               acc2 = adjusted_bias_data;
3482               acc3 = adjusted_bias_data;
3483 
3484               acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
3485               acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
3486               acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
3487               acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
3488               acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
3489               acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
3490               acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
3491               acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
3492               acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
3493               acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
3494               acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
3495               acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
3496 
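              // Requantization, repeated for each output block below: the
              // int32 accumulators are scaled by output_multiplier with a
              // saturating rounding doubling-high multiply, divided by
              // 2^(-output_shift) with upward rounding, narrowed to int16 with
              // saturation, offset by output_offset, narrowed again to 8 bits
              // with saturation, and clamped to the activation range. Roughly,
              // in scalar form (a sketch, not the exact intrinsic semantics):
              //   acc = SaturatingRoundingDoublingHighMul(acc, output_multiplier);
              //   acc = RoundingDivideByPOT(acc, -output_shift);
              //   out = Clamp(Saturate8(Saturate16(acc) + output_offset),
              //               output_activation_min, output_activation_max);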
3497               // Fixed-point multiplication.
3498               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3499               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3500                   acc0, -output_shift);
3501               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3502               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3503                   acc1, -output_shift);
3504               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3505               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3506                   acc2, -output_shift);
3507               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3508               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3509                   acc3, -output_shift);
3510               // Add the output offset.
3511               int16x8_t acc_s16_0_1 =
3512                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3513               int16x8_t acc_s16_2_3 =
3514                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3515               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3516               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3517               // Apply the activation function.
3518               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3519                                                   vqmovxn_s16(acc_s16_2_3));
3520               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3521               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3522 
3523               vst1q_lane_u8x4(output_data, acc_u8_all, 0);
3524               vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
3525                               1);
3526               vst1q_lane_u8x4(output_data + 2 * output_height_stride,
3527                               acc_u8_all, 2);
3528               vst1q_lane_u8x4(output_data + 3 * output_height_stride,
3529                               acc_u8_all, 3);
3530 
3531               left_bank_0_reg = vreinterpretq_s8_u16(
3532                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
3533               left_bank_1_reg = vreinterpretq_s8_u16(
3534                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
3535               left_bank_2_reg = vreinterpretq_s8_u16(
3536                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
3537               left_bank_3_reg = vreinterpretq_s8_u16(
3538                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
3539               left_bank_4_reg = vreinterpretq_s8_u16(
3540                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
3541               left_bank_5_reg = vreinterpretq_s8_u16(
3542                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_5_reg)));
3543               vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
3544               vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
3545               vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
3546               vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
3547               vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
3548               vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
3549 
3550               output_data += depth;
3551             }
3552 
3553             {
3554               acc0 = adjusted_bias_data;
3555               acc1 = adjusted_bias_data;
3556               acc2 = adjusted_bias_data;
3557               acc3 = adjusted_bias_data;
3558 
3559               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3560               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3561               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3562               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3563               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3564               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3565               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3566               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3567               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3568               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3569               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3570               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3571 
3572               // Fixed-point multiplication.
3573               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3574               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3575                   acc0, -output_shift);
3576               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3577               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3578                   acc1, -output_shift);
3579               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3580               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3581                   acc2, -output_shift);
3582               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3583               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3584                   acc3, -output_shift);
3585               // Add the output offset.
3586               int16x8_t acc_s16_0_1 =
3587                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3588               int16x8_t acc_s16_2_3 =
3589                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3590               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3591               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3592               // Apply the activation function.
3593               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3594                                                   vqmovxn_s16(acc_s16_2_3));
3595               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3596               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3597 
3598               vst1q_lane_u8x4(output_data, acc_u8_all, 0);
3599               vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
3600                               1);
3601               vst1q_lane_u8x4(output_data + 2 * output_height_stride,
3602                               acc_u8_all, 2);
3603               vst1q_lane_u8x4(output_data + 3 * output_height_stride,
3604                               acc_u8_all, 3);
3605 
3606               output_data += depth;
3607             }
3608 
3609             {
3610               acc0 = adjusted_bias_data;
3611               acc1 = adjusted_bias_data;
3612               acc2 = adjusted_bias_data;
3613               acc3 = adjusted_bias_data;
3614 
3615               acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
3616               acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
3617               acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
3618               acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
3619               acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
3620               acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
3621               acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
3622               acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
3623               acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
3624               acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
3625               acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
3626               acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
3627 
3628               // Fixed-point multiplication.
3629               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3630               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3631                   acc0, -output_shift);
3632               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3633               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3634                   acc1, -output_shift);
3635               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3636               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3637                   acc2, -output_shift);
3638               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3639               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3640                   acc3, -output_shift);
3641               // Add the output offset.
3642               int16x8_t acc_s16_0_1 =
3643                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3644               int16x8_t acc_s16_2_3 =
3645                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3646               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3647               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3648               // Apply the activation function.
3649               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3650                                                   vqmovxn_s16(acc_s16_2_3));
3651               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3652               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3653 
3654               vst1q_lane_u8x4(output_data, acc_u8_all, 0);
3655               vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
3656                               1);
3657               vst1q_lane_u8x4(output_data + 2 * output_height_stride,
3658                               acc_u8_all, 2);
3659               vst1q_lane_u8x4(output_data + 3 * output_height_stride,
3660                               acc_u8_all, 3);
3661 
3662               left_bank_0_reg = right_bank_0_reg;
3663               left_bank_1_reg = right_bank_1_reg;
3664               left_bank_2_reg = right_bank_2_reg;
3665               left_bank_3_reg = right_bank_3_reg;
3666               left_bank_4_reg = right_bank_4_reg;
3667               left_bank_5_reg = right_bank_5_reg;
3668 
3669               output_data += depth;
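              // Pre-seed the accumulators for the next micro block with the
              // bias and a few of its dot products, so that this work overlaps
              // with the right-bank loads issued early in the next iteration.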
3670               acc0 = adjusted_bias_data;
3671               acc1 = adjusted_bias_data;
3672               acc2 = adjusted_bias_data;
3673               acc3 = adjusted_bias_data;
3674 
3675               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3676               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3677               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3678               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3679             }
3680           }
3681 
3682           if (residual_width > 0) {
3683             next_input_data += width_micro_stride;
3684             const int output_width = residual_width;
3685 
3686             // Load next sub-micro block of data.
3687             int8x16_t right_bank_0_reg;
3688             int8x16_t right_bank_1_reg;
3689             int8x16_t right_bank_2_reg;
3690             int8x16_t right_bank_3_reg;
3691             int8x16_t right_bank_4_reg;
3692             int8x16_t right_bank_5_reg;
3693             // Logic: (output_width - 1) * stride_val < 2.
3694             const bool no_right_block = output_width < 3;
3695 
3696             if (no_right_block) {
3697               // Only needed for sanitizer checks.
3698               right_bank_0_reg = vdupq_n_s8(0);
3699               right_bank_1_reg = vdupq_n_s8(0);
3700               right_bank_2_reg = vdupq_n_s8(0);
3701               right_bank_3_reg = vdupq_n_s8(0);
3702               right_bank_4_reg = vdupq_n_s8(0);
3703               right_bank_5_reg = vdupq_n_s8(0);
3704             } else {
3705               right_bank_0_reg = vld1q_s8(next_input_data);
3706               right_bank_1_reg =
3707                   vld1q_s8(next_input_data + workspace_height_stride);
3708               right_bank_2_reg =
3709                   vld1q_s8(next_input_data + 2 * workspace_height_stride);
3710               right_bank_3_reg =
3711                   vld1q_s8(next_input_data + 3 * workspace_height_stride);
3712               right_bank_4_reg =
3713                   vld1q_s8(next_input_data + 4 * workspace_height_stride);
3714               right_bank_5_reg =
3715                   vld1q_s8(next_input_data + 5 * workspace_height_stride);
3716             }
3717 
3718             // Iterate over input width shifts within 4x4 blocks.
3719             for (int x = 0; x < output_width; ++x) {
3720               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3721               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3722               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3723               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3724               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3725               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3726               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3727               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3728 
3729               // Fixed-point multiplication.
3730               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3731               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3732                   acc0, -output_shift);
3733               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3734               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3735                   acc1, -output_shift);
3736               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3737               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3738                   acc2, -output_shift);
3739               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3740               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3741                   acc3, -output_shift);
3742               // Add the output offset.
3743               int16x8_t acc_s16_0_1 =
3744                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3745               int16x8_t acc_s16_2_3 =
3746                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3747               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3748               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3749               // Apply the activation function.
3750               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3751                                                   vqmovxn_s16(acc_s16_2_3));
3752               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3753               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3754 
3755               vst1q_lane_u8x4(output_data, acc_u8_all, 0);
3756               vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
3757                               1);
3758               vst1q_lane_u8x4(output_data + 2 * output_height_stride,
3759                               acc_u8_all, 2);
3760               vst1q_lane_u8x4(output_data + 3 * output_height_stride,
3761                               acc_u8_all, 3);
3762 
3763               biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
3764               biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
3765               biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
3766               biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
3767               biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
3768               biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
3769 
3770               output_data += depth;
3771 
3772               acc0 = adjusted_bias_data;
3773               acc1 = adjusted_bias_data;
3774               acc2 = adjusted_bias_data;
3775               acc3 = adjusted_bias_data;
3776 
3777               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3778               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3779               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3780               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3781             }
3782           }
3783           input_data_base += 4 * workspace_height_stride;
3784           output_data_base += 4 * output_height_stride;
3785 
3786           // Move to the next sub-block: advance to the second set of
3787           // filters and to the new bias.
3788           filter_reg_0_a = filter_reg_0_b;
3789           filter_reg_1_a = filter_reg_1_b;
3790           filter_reg_2_a = filter_reg_2_b;
3791           filter_reg_0_a_shifted = vreinterpretq_s8_u32(
3792               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
3793           filter_reg_1_a_shifted = vreinterpretq_s8_u32(
3794               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
3795           filter_reg_2_a_shifted = vreinterpretq_s8_u32(
3796               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
3797         }
3798       } else {
3799         const int8* input_data_base = input_data_depthwise;
3800         typename QuantizationTypeImpl<quantization_type>::ExternalType*
3801             output_data_base = output_data_depthwise;
3802 
3803         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
3804         bias_data += kBiasIncrement;
3805         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
3806         bias_data += kBiasIncrement;
3807 
3808         for (int k_height = 0; k_height < block_height; ++k_height) {
3809           const int8* next_input_data = input_data_base;
3810           typename QuantizationTypeImpl<quantization_type>::ExternalType*
3811               output_data = output_data_base;
3812 
3813           // Load first sub-micro block of data into operational banks.
3814           int8x16_t left_bank_0_reg_a = vld1q_s8(next_input_data);
3815           int8x16_t left_bank_1_reg_a =
3816               vld1q_s8(next_input_data + workspace_height_stride);
3817           int8x16_t left_bank_2_reg_a =
3818               vld1q_s8(next_input_data + 2 * workspace_height_stride);
3819           int8x16_t left_bank_0_reg_b = vld1q_s8(next_input_data + 16);
3820           int8x16_t left_bank_1_reg_b =
3821               vld1q_s8(next_input_data + workspace_height_stride + 16);
3822           int8x16_t left_bank_2_reg_b =
3823               vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
3824 
3825           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
3826                ++i_width) {
3827             next_input_data += width_micro_stride;
3828             const int output_width =
3829                 i_width == output_width_micro_repeats ? residual_width : 4;
3830 
3831             int8x16_t right_bank_0_reg_a;
3832             int8x16_t right_bank_1_reg_a;
3833             int8x16_t right_bank_2_reg_a;
3834             int8x16_t right_bank_0_reg_b;
3835             int8x16_t right_bank_1_reg_b;
3836             int8x16_t right_bank_2_reg_b;
3837             // Logic: (output_width - 1) * stride_val < 2.
3838             const bool no_right_block = output_width < 3;
3839 
3840             // Load next sub-micro block of data.
3841             if (no_right_block) {
3842               // Only needed for sanitizer checks.
3843               right_bank_0_reg_a = vdupq_n_s8(0);
3844               right_bank_1_reg_a = vdupq_n_s8(0);
3845               right_bank_2_reg_a = vdupq_n_s8(0);
3846               right_bank_0_reg_b = vdupq_n_s8(0);
3847               right_bank_1_reg_b = vdupq_n_s8(0);
3848               right_bank_2_reg_b = vdupq_n_s8(0);
3849             } else {
3850               right_bank_0_reg_a = vld1q_s8(next_input_data);
3851               right_bank_1_reg_a =
3852                   vld1q_s8(next_input_data + workspace_height_stride);
3853               right_bank_2_reg_a =
3854                   vld1q_s8(next_input_data + 2 * workspace_height_stride);
3855               right_bank_0_reg_b = vld1q_s8(next_input_data + 16);
3856               right_bank_1_reg_b =
3857                   vld1q_s8(next_input_data + workspace_height_stride + 16);
3858               right_bank_2_reg_b =
3859                   vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
3860             }
3861 
3862             // Iterate over input width shifts within 4x4 blocks.
3863             for (int x = 0; x < output_width; ++x) {
3864               int32x4_t acc_a = adjusted_bias_data_a;
3865               int32x4_t acc_b = adjusted_bias_data_b;
3866               acc_a = vdotq_s32(acc_a, filter_reg_0_a, left_bank_0_reg_a);
3867               acc_a = vdotq_s32(acc_a, filter_reg_1_a, left_bank_1_reg_a);
3868               acc_a = vdotq_s32(acc_a, filter_reg_2_a, left_bank_2_reg_a);
3869               acc_b = vdotq_s32(acc_b, filter_reg_0_b, left_bank_0_reg_b);
3870               acc_b = vdotq_s32(acc_b, filter_reg_1_b, left_bank_1_reg_b);
3871               acc_b = vdotq_s32(acc_b, filter_reg_2_b, left_bank_2_reg_b);
3872 
3873               // Fixed-point multiplication.
3874               acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
3875               acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
3876               acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3877                   acc_a, -output_shift);
3878               acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3879                   acc_b, -output_shift);
3880               // Add the output offset.
3881               int16x8_t acc_s16_0_0 =
3882                   vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
3883               acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
3884               // Apply the activation function.
3885               uint8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
3886               acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
3887                                         vget_low_u8(output_activation_min_vec));
3888               acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
3889                                         vget_low_u8(output_activation_max_vec));
3890 
3891               util_vst1_u8(output_data, acc_u8_0_0);
3892 
3893               biregister_rotate_8(&left_bank_0_reg_a, &right_bank_0_reg_a);
3894               biregister_rotate_8(&left_bank_1_reg_a, &right_bank_1_reg_a);
3895               biregister_rotate_8(&left_bank_2_reg_a, &right_bank_2_reg_a);
3896               biregister_rotate_8(&left_bank_0_reg_b, &right_bank_0_reg_b);
3897               biregister_rotate_8(&left_bank_1_reg_b, &right_bank_1_reg_b);
3898               biregister_rotate_8(&left_bank_2_reg_b, &right_bank_2_reg_b);
3899 
3900               output_data += depth;
3901             }
3902           }
3903           input_data_base += workspace_height_stride;
3904           output_data_base += output_height_stride;
3905         }
3906       }
3907       input_data_depthwise += depth_micro_stride;
3908       output_data_depthwise += 8;
3909     }
3910   }  // NOLINT(readability/fn_size) Manually unrolled.
3911 
3912   static inline void Run(const int8* scratch_block_data,
3913                          const int8* filter_workspace, const int32* bias_data,
3914                          uint8* output_block_data,
3915                          const DepthwiseConvDotProdParams* function_params) {
3916     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
3917                                output_block_data, function_params);
3918   }
3919 };
3920 
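// Stride-2 variant of the dot-product kernel for uint8 (non-per-channel)
// quantization without depth multiplication. Each pass covers at most two
// output rows (block_height <= 2), and because the stride is 2 a 4-wide input
// micro block yields only two output columns (four_over_stride == 2).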
3921 template <>
3922 struct KernelMacroBlock<
3923     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
3924     QuantizationType::kNonPerChannelUint8,
3925     DepthwiseConvDepthMultiplication::kNoMultiplication,
3926     /*stride=*/2> {
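  // Type-generic helpers: for the uint8 kernels the "x8" wrappers resolve to
  // the unsigned saturating-narrow and min/max intrinsics.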
3927   static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
3928   static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
3929     return vmin_u8(a, b);
3930   }
3931   static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
3932     return vmax_u8(a, b);
3933   }
3934 
3935   static inline void KernelMacroBlockIntrinsics(
3936       const int8* scratch_block_data, const int8* filter_workspace,
3937       const int32* bias_data, uint8* output_block_data,
3938       const DepthwiseConvDotProdParams* function_params) {
3939     static constexpr QuantizationType quantization_type =
3940         QuantizationType::kNonPerChannelUint8;
3941 
3942     const int workspace_height_stride =
3943         function_params->workspace_height_stride;
3944     const int input_width_overall_micro_repeats =
3945         function_params->input_width_overall_micro_repeats;
3946     const int output_width_micro_repeats =
3947         function_params->output_width_micro_repeats;
3948     const int depth_micro_repeats = function_params->depth_micro_repeats;
3949     const int depth = function_params->input_depth;
3950     constexpr int kStrideVal = 2;
3951     constexpr int kFourOverStride = 2;
3952     TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
3953     TFLITE_DCHECK_EQ(function_params->four_over_stride, kFourOverStride);
3954 
3955     const int workspace_width_micro_repeats =
3956         function_params->workspace_width_micro_repeats;
3957     const int output_width_overall_micro_repeats =
3958         function_params->output_width_overall_micro_repeats;
3959     const int block_height = function_params->outbound_block_height;
3960     const int residual_width = function_params->output_residual_width;
3961     const int output_height_stride = function_params->output_height_stride;
3962     constexpr int kBiasIncrement = 4;
3963 
3964     TFLITE_DCHECK(depth_micro_repeats > 0);
3965     const int width_micro_stride = 4 * 8;
3966     const int depth_micro_stride =
3967         width_micro_stride * input_width_overall_micro_repeats;
3968 
3969     const int32 output_activation_min =
3970         function_params->quantized_activation_min;
3971     const int32 output_activation_max =
3972         function_params->quantized_activation_max;
3973     const int32 output_multiplier = function_params->output_multiplier;
3974     const int32 output_shift = function_params->output_shift;
3975     const int32 output_offset = function_params->output_offset;
3976     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
3977       TFLITE_DCHECK_GE(output_activation_min, 0);
3978       TFLITE_DCHECK_LT(output_activation_min, 256);
3979       TFLITE_DCHECK_GE(output_activation_max, 0);
3980       TFLITE_DCHECK_LT(output_activation_max, 256);
3981     } else {
3982       TFLITE_DCHECK_GE(output_activation_min, -128);
3983       TFLITE_DCHECK_LT(output_activation_min, 128);
3984       TFLITE_DCHECK_GE(output_activation_max, -128);
3985       TFLITE_DCHECK_LT(output_activation_max, 128);
3986     }
3987     TFLITE_DCHECK_GE(output_offset, -32768);
3988     TFLITE_DCHECK_LT(output_offset, 32768);
3989 
3990     // This version applies the activation min/max only on 64-bit (8-lane)
3991     // vectors.
3991     const int16x8_t output_offset_vec =
3992         vdupq_n_s16(static_cast<int16>(output_offset));
3993     const uint8x8_t output_activation_min_vec =
3994         vdup_n_u8(static_cast<uint8>(output_activation_min));
3995     const uint8x8_t output_activation_max_vec =
3996         vdup_n_u8(static_cast<uint8>(output_activation_max));
3997 
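    // Shuffled filter bytes consumed per depth micro block: two 4-channel
    // sub-blocks x three filter rows x one 16-byte register each = 96 bytes.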
3998     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
3999 
4000     TFLITE_DCHECK_LE(block_height, 2);
4001 
4002     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
4003       const int8* filter_block =
4004           filter_workspace + shuffled_filter_increment * j_depth;
4005 
4006       if (block_height == 2) {
4007         for (int s = 0; s < 2; ++s) {
4008           // Simulate NEON-register transposition of subset of filter.
4009           int8x16_t filter_reg_0_a;
4010           int8x16_t filter_reg_1_a;
4011           int8x16_t filter_reg_2_a;
4012 
4013           filter_reg_0_a = vld1q_s8(filter_block + s * 16);
4014           filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
4015           filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
4016 
4017           const int8* scratch_data =
4018               scratch_block_data + depth_micro_stride * j_depth;
4019           typename QuantizationTypeImpl<quantization_type>::ExternalType*
4020               output_data = output_block_data + 8 * j_depth;
4021           const int8* input_data_0 = scratch_data + s * 2 * 8;
4022 
4023           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
4024 
4025           // Load first sub-micro block of data into operational banks.
4026           int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
4027           int8x16_t left_bank_1_reg =
4028               vld1q_s8(input_data_0 + workspace_height_stride);
4029           int8x16_t left_bank_2_reg =
4030               vld1q_s8(input_data_0 + 2 * workspace_height_stride);
4031           int8x16_t left_bank_3_reg =
4032               vld1q_s8(input_data_0 + 3 * workspace_height_stride);
4033           int8x16_t left_bank_4_reg =
4034               vld1q_s8(input_data_0 + 4 * workspace_height_stride);
4035 
4036           int8x16_t right_bank_0_reg;
4037           int8x16_t right_bank_1_reg;
4038           int8x16_t right_bank_2_reg;
4039           int8x16_t right_bank_3_reg;
4040           int8x16_t right_bank_4_reg;
4041 
4042           int32x4_t acc0;
4043           int32x4_t acc1;
4044           int16x8_t acc_s16_0_1;
4045           uint8x8_t acc_u8;
4046 
4047           int i_width = 0;
4048 
4049           // When output_width_micro_repeats <
4050           // output_width_overall_micro_repeats, 0 < residual_width <= 2, and so
4051           // residual_width == 1 is then true iff residual_width < 2.
4052           const int adjusted_width_micro_repeats =
4053               (output_width_micro_repeats <
4054                output_width_overall_micro_repeats) &&
4055                       (residual_width == 1)
4056                   ? output_width_micro_repeats
4057                   : output_width_overall_micro_repeats;
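          // When the trailing micro block would produce only one output column
          // (residual_width == 1), the main loop stops early and that column
          // is handled by the residual loop below, which skips the
          // second-column pass and the right-bank loads.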
4058 
4059           for (; i_width < adjusted_width_micro_repeats; ++i_width) {
4060             const int output_width = kFourOverStride;
4061             TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
4062             const int8* input_data =
4063                 input_data_0 + width_micro_stride * i_width;
4064             acc0 = adjusted_bias_data;
4065             acc1 = adjusted_bias_data;
4066             right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
4067             right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
4068                                         workspace_height_stride);
4069 
4070             acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4071             acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4072             typename QuantizationTypeImpl<quantization_type>::ExternalType*
4073                 output_data_base = output_data + depth * 2 * i_width + 4 * s;
4074 
4075             right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
4076                                         2 * workspace_height_stride);
4077             right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
4078                                         3 * workspace_height_stride);
4079             acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4080             acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4081             acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4082             acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4083             right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
4084                                         4 * workspace_height_stride);
4085 
4086             // Fixed-point multiplication.
4087             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4088             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4089                 acc0, -output_shift);
4090             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4091             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4092                 acc1, -output_shift);
4093             // Add the output offset.
4094             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4095             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4096             // Apply the activation function.
4097             acc_u8 = vqmovxn_s16(acc_s16_0_1);
4098             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4099             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4100 
4101             left_bank_0_reg = vreinterpretq_s8_u16(
4102                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
4103             left_bank_1_reg = vreinterpretq_s8_u16(
4104                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
4105             left_bank_2_reg = vreinterpretq_s8_u16(
4106                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
4107             left_bank_3_reg = vreinterpretq_s8_u16(
4108                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
4109             left_bank_4_reg = vreinterpretq_s8_u16(
4110                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
4111             acc0 = adjusted_bias_data;
4112             acc1 = adjusted_bias_data;
4113             vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
4114             vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
4115             vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
4116             vst1_lane_u8x4(output_data_base, acc_u8, 0);
4117             vst1_lane_u8x4(output_data_base + output_height_stride, acc_u8, 1);
4118 
4119             vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
4120             vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
4121 
4122             acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4123             acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4124             acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4125             acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4126             acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4127             acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4128 
4129             // Fixed-point multiplication.
4130             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4131             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4132                 acc0, -output_shift);
4133             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4134             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4135                 acc1, -output_shift);
4136             // Add the output offset.
4137             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4138             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4139             // Apply the activation function.
4140             acc_u8 = vqmovxn_s16(acc_s16_0_1);
4141             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4142             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4143 
4144             vst1_lane_u8x4(output_data_base + depth, acc_u8, 0);
4145             vst1_lane_u8x4(output_data_base + depth + output_height_stride,
4146                            acc_u8, 1);
4147 
4148             left_bank_0_reg = right_bank_0_reg;
4149             left_bank_1_reg = right_bank_1_reg;
4150             left_bank_2_reg = right_bank_2_reg;
4151             left_bank_3_reg = right_bank_3_reg;
4152             left_bank_4_reg = right_bank_4_reg;
4153           }
4154           for (; i_width < output_width_overall_micro_repeats; ++i_width) {
4155             TFLITE_DCHECK_NE(residual_width, kFourOverStride);
4156 
4157             // No need to load next ("right") block of data.
4158 
4159             typename QuantizationTypeImpl<quantization_type>::ExternalType*
4160                 output_data_base = output_data + depth * 2 * i_width + 4 * s;
4161 
4162             // Iterate over input width shifts within 4x4 blocks.
4163             {
4164               acc0 = adjusted_bias_data;
4165               acc1 = adjusted_bias_data;
4166 
4167               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4168               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4169               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4170               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4171               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4172               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4173 
4174               // Fixed-point multiplication.
4175               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4176               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4177                   acc0, -output_shift);
4178               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4179               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4180                   acc1, -output_shift);
4181               // Add the output offset.
4182               int16x8_t acc_s16_0_1 =
4183                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4184               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4185               // Apply the activation function.
4186               uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4187               acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4188               acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4189 
4190               vst1_lane_u8x4(output_data_base, acc_u8, 0);
4191               vst1_lane_u8x4(output_data_base + output_height_stride, acc_u8,
4192                              1);
4193 
4194               left_bank_0_reg = vreinterpretq_s8_u16(
4195                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
4196               left_bank_1_reg = vreinterpretq_s8_u16(
4197                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
4198               left_bank_2_reg = vreinterpretq_s8_u16(
4199                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
4200               left_bank_3_reg = vreinterpretq_s8_u16(
4201                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
4202               left_bank_4_reg = vreinterpretq_s8_u16(
4203                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
4204               vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
4205               vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
4206               vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
4207               vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
4208               vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
4209             }
4210           }
4211           bias_data += kBiasIncrement;
4212         }
4213       } else {
4214         // block_height == 1.
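        // Both 4-channel halves of the depth-8 micro block (the *_a and *_b
        // register sets) are processed together across a single output row.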
4215         int8x16_t filter_reg_0_a;
4216         int8x16_t filter_reg_1_a;
4217         int8x16_t filter_reg_2_a;
4218         int8x16_t filter_reg_0_b;
4219         int8x16_t filter_reg_1_b;
4220         int8x16_t filter_reg_2_b;
4221 
4222         filter_reg_0_a = vld1q_s8(filter_block);
4223         filter_reg_1_a = vld1q_s8(filter_block + 32);
4224         filter_reg_2_a = vld1q_s8(filter_block + 64);
4225         filter_reg_0_b = vld1q_s8(filter_block + 16);
4226         filter_reg_1_b = vld1q_s8(filter_block + 16 + 32);
4227         filter_reg_2_b = vld1q_s8(filter_block + 16 + 64);
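        // The a/b register pairs hold the two 4-channel halves of the depth-8
        // micro block, one filter row per pair.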
4228 
4229         const int8* scratch_data =
4230             scratch_block_data + depth_micro_stride * j_depth;
4231         typename QuantizationTypeImpl<quantization_type>::ExternalType*
4232             output_data = output_block_data + 8 * j_depth;
4233         const int8* input_data_0 = scratch_data;
4234 
4235         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
4236         bias_data += kBiasIncrement;
4237         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
4238         bias_data += kBiasIncrement;
4239 
4240         // Load first sub-micro block of data into operational banks.
4241         int8x16_t left_bank_0_reg_a = vld1q_s8(input_data_0);
4242         int8x16_t left_bank_1_reg_a =
4243             vld1q_s8(input_data_0 + workspace_height_stride);
4244         int8x16_t left_bank_2_reg_a =
4245             vld1q_s8(input_data_0 + 2 * workspace_height_stride);
4246         int8x16_t left_bank_0_reg_b = vld1q_s8(input_data_0 + 16);
4247         int8x16_t left_bank_1_reg_b =
4248             vld1q_s8(input_data_0 + workspace_height_stride + 16);
4249         int8x16_t left_bank_2_reg_b =
4250             vld1q_s8(input_data_0 + 2 * workspace_height_stride + 16);
4251 
4252         int8x16_t right_bank_0_reg_a;
4253         int8x16_t right_bank_1_reg_a;
4254         int8x16_t right_bank_2_reg_a;
4255         int8x16_t right_bank_0_reg_b;
4256         int8x16_t right_bank_1_reg_b;
4257         int8x16_t right_bank_2_reg_b;
4258 
4259         int32x4_t acc0_a;
4260         int32x4_t acc0_b;
4261 
4262         for (int i_width = 0; i_width < output_width_overall_micro_repeats;
4263              ++i_width) {
4264           const int output_width = i_width == output_width_micro_repeats
4265                                        ? residual_width
4266                                        : kFourOverStride;
4267           TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
4268           const int8* input_data = input_data_0 + width_micro_stride * i_width;
4269           const bool no_right_block = i_width == output_width_micro_repeats &&
4270                                       output_width_overall_micro_repeats ==
4271                                           workspace_width_micro_repeats;
4272 
4273           if (!no_right_block) {
4274             // Load next sub-micro block of data.
4275             right_bank_0_reg_a = vld1q_s8(input_data + width_micro_stride);
4276             right_bank_1_reg_a = vld1q_s8(input_data + width_micro_stride +
4277                                           workspace_height_stride);
4278             right_bank_2_reg_a = vld1q_s8(input_data + width_micro_stride +
4279                                           2 * workspace_height_stride);
4280             right_bank_0_reg_b = vld1q_s8(input_data + width_micro_stride + 16);
4281             right_bank_1_reg_b = vld1q_s8(input_data + width_micro_stride +
4282                                           workspace_height_stride + 16);
4283             right_bank_2_reg_b = vld1q_s8(input_data + width_micro_stride +
4284                                           2 * workspace_height_stride + 16);
4285           }
4286 
4287           typename QuantizationTypeImpl<quantization_type>::ExternalType*
4288               output_data_base = output_data + depth * 2 * i_width;
4289 
4290           // Iterate over input width shifts within 4x4 blocks.
4291           {
4292             acc0_a = adjusted_bias_data_a;
4293             acc0_b = adjusted_bias_data_b;
4294 
4295             acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
4296             acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
4297             acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
4298             acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
4299             acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
4300             acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
4301 
4302             // Fixed-point multiplication.
4303             acc0_a = vqrdmulhq_n_s32(acc0_a, output_multiplier);
4304             acc0_b = vqrdmulhq_n_s32(acc0_b, output_multiplier);
4305             acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4306                 acc0_a, -output_shift);
4307             acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4308                 acc0_b, -output_shift);
4309             // Add the output offset.
4310             int16x8_t acc_s16_0_1 =
4311                 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
4312             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4313             // Apply the activation function.
4314             uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4315             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4316             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4317 
4318             util_vst1_u8(output_data_base, acc_u8);
4319 
4320             left_bank_0_reg_a = vreinterpretq_s8_u16(
4321                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg_a)));
4322             left_bank_1_reg_a = vreinterpretq_s8_u16(
4323                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg_a)));
4324             left_bank_2_reg_a = vreinterpretq_s8_u16(
4325                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg_a)));
4326             left_bank_0_reg_b = vreinterpretq_s8_u16(
4327                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg_b)));
4328             left_bank_1_reg_b = vreinterpretq_s8_u16(
4329                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg_b)));
4330             left_bank_2_reg_b = vreinterpretq_s8_u16(
4331                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg_b)));
4332             vtrn1_s8x2_in_place(&left_bank_0_reg_a, &right_bank_0_reg_a);
4333             vtrn1_s8x2_in_place(&left_bank_1_reg_a, &right_bank_1_reg_a);
4334             vtrn1_s8x2_in_place(&left_bank_2_reg_a, &right_bank_2_reg_a);
4335             vtrn1_s8x2_in_place(&left_bank_0_reg_b, &right_bank_0_reg_b);
4336             vtrn1_s8x2_in_place(&left_bank_1_reg_b, &right_bank_1_reg_b);
4337             vtrn1_s8x2_in_place(&left_bank_2_reg_b, &right_bank_2_reg_b);
4338           }
4339 
4340           if (output_width > 1) {
4341             acc0_a = adjusted_bias_data_a;
4342             acc0_b = adjusted_bias_data_b;
4343 
4344             acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
4345             acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
4346             acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
4347             acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
4348             acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
4349             acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
4350 
4351             // Fixed-point multiplication.
4352             acc0_a = vqrdmulhq_n_s32(acc0_a, output_multiplier);
4353             acc0_b = vqrdmulhq_n_s32(acc0_b, output_multiplier);
4354             acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4355                 acc0_a, -output_shift);
4356             acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4357                 acc0_b, -output_shift);
4358             // Add the output offset.
4359             int16x8_t acc_s16_0_1 =
4360                 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
4361             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4362             // Apply the activation function.
4363             uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4364             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4365             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4366 
4367             util_vst1_u8(output_data_base + depth, acc_u8);
4368 
4369             left_bank_0_reg_a = right_bank_0_reg_a;
4370             left_bank_1_reg_a = right_bank_1_reg_a;
4371             left_bank_2_reg_a = right_bank_2_reg_a;
4372             left_bank_0_reg_b = right_bank_0_reg_b;
4373             left_bank_1_reg_b = right_bank_1_reg_b;
4374             left_bank_2_reg_b = right_bank_2_reg_b;
4375           }
4376         }
4377       }
4378     }
4379   }  // NOLINT(readability/fn_size) Manually unrolled.
4380 
4381   static inline void Run(const int8* scratch_block_data,
4382                          const int8* filter_workspace, const int32* bias_data,
4383                          uint8* output_block_data,
4384                          const DepthwiseConvDotProdParams* function_params) {
4385     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
4386                                output_block_data, function_params);
4387   }
4388 };
4389 
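// Unit-input-depth variant for stride 1: the single input channel is shared by
// all output channels, so whole input rows are packed four pixels to a 32-bit
// lane and broadcast against the per-channel filter taps via
// vdotq_four_lane_s32.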
4390 template <>
4391 struct KernelMacroBlock<
4392     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
4393     QuantizationType::kNonPerChannelUint8,
4394     DepthwiseConvDepthMultiplication::kUnitInputDepth,
4395     /*stride=*/1> {
4396   static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
4397   static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
4398     return vmin_u8(a, b);
4399   }
4400   static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
4401     return vmax_u8(a, b);
4402   }
4403   static inline uint8x16_t util_vminq_x8(uint8x16_t a, uint8x16_t b) {
4404     return vminq_u8(a, b);
4405   }
4406   static inline uint8x16_t util_vmaxq_x8(uint8x16_t a, uint8x16_t b) {
4407     return vmaxq_u8(a, b);
4408   }
4409 
4410   static inline void KernelMacroBlockIntrinsics(
4411       const int8* scratch_block_data, const int8* filter_workspace,
4412       const int32* bias_data, uint8* output_block_data,
4413       const DepthwiseConvDotProdParams* function_params) {
4414     static constexpr QuantizationType quantization_type =
4415         QuantizationType::kNonPerChannelUint8;
4416 
4417     TFLITE_DCHECK_EQ(function_params->stride, 1);
4418     const int workspace_height_stride =
4419         function_params->workspace_height_stride;
4420     const int output_width_micro_repeats =
4421         function_params->output_width_micro_repeats;
4422     const int depth_micro_repeats = function_params->depth_micro_repeats;
4423     const int output_depth = function_params->output_depth;
4424 
4425     const int output_width_overall_micro_repeats =
4426         function_params->output_width_overall_micro_repeats;
4427     const int block_height = function_params->outbound_block_height;
4428     const int residual_width = function_params->output_residual_width;
4429     const int output_height_stride = function_params->output_height_stride;
4430     constexpr int kBiasIncrement = 4;
4431 
4432     TFLITE_DCHECK(depth_micro_repeats > 0);
4433 
4434     const int32 output_activation_min =
4435         function_params->quantized_activation_min;
4436     const int32 output_activation_max =
4437         function_params->quantized_activation_max;
4438     const int32 output_multiplier = function_params->output_multiplier;
4439     const int32 output_shift = function_params->output_shift;
4440     const int32 output_offset = function_params->output_offset;
4441     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
4442       TFLITE_DCHECK_GE(output_activation_min, 0);
4443       TFLITE_DCHECK_LT(output_activation_min, 256);
4444       TFLITE_DCHECK_GE(output_activation_max, 0);
4445       TFLITE_DCHECK_LT(output_activation_max, 256);
4446     } else {
4447       TFLITE_DCHECK_GE(output_activation_min, -128);
4448       TFLITE_DCHECK_LT(output_activation_min, 128);
4449       TFLITE_DCHECK_GE(output_activation_max, -128);
4450       TFLITE_DCHECK_LT(output_activation_max, 128);
4451     }
4452     TFLITE_DCHECK_GE(output_offset, -32768);
4453     TFLITE_DCHECK_LT(output_offset, 32768);
4454 
4455     const int16x8_t output_offset_vec =
4456         vdupq_n_s16(static_cast<int16>(output_offset));
4457     const uint8x16_t output_activation_min_vec =
4458         vdupq_n_u8(static_cast<uint8>(output_activation_min));
4459     const uint8x16_t output_activation_max_vec =
4460         vdupq_n_u8(static_cast<uint8>(output_activation_max));
4461 
4462     typename QuantizationTypeImpl<quantization_type>::ExternalType*
4463         output_data_depthwise = output_block_data;
4464     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
4465       // Simulate NEON-register transposition of subset of filter.
4466       int8x16_t filter_reg_0_a;
4467       int8x16_t filter_reg_0_b;
4468       int8x16_t filter_reg_1_a;
4469       int8x16_t filter_reg_1_b;
4470       int8x16_t filter_reg_2_a;
4471       int8x16_t filter_reg_2_b;
4472       int8x16_t filter_reg_0_a_shifted;
4473       int8x16_t filter_reg_1_a_shifted;
4474       int8x16_t filter_reg_2_a_shifted;
4475 
4476       filter_reg_0_a = vld1q_s8(filter_workspace);
4477       filter_workspace += 16;
4478       filter_reg_0_b = vld1q_s8(filter_workspace);
4479       filter_workspace += 16;
4480       filter_reg_1_a = vld1q_s8(filter_workspace);
4481       filter_workspace += 16;
4482       filter_reg_1_b = vld1q_s8(filter_workspace);
4483       filter_workspace += 16;
4484       filter_reg_2_a = vld1q_s8(filter_workspace);
4485       filter_workspace += 16;
4486       filter_reg_2_b = vld1q_s8(filter_workspace);
4487       filter_workspace += 16;
4488 
4489       filter_reg_0_a_shifted = vreinterpretq_s8_u32(
4490           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
4491       filter_reg_1_a_shifted = vreinterpretq_s8_u32(
4492           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
4493       filter_reg_2_a_shifted = vreinterpretq_s8_u32(
4494           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
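      // The "_shifted" copies move each 32-bit filter lane up by one byte;
      // they pair with the same input lanes to produce the outputs at the odd
      // column offsets within each 4-wide micro block, avoiding a separate
      // re-alignment of the input bytes.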
4495 
4496       // When output_width_micro_repeats < output_width_overall_micro_repeats,
4497       // the trailing micro block is partial (0 < residual_width < 4); the main
4498       // loop then stops early and the residual columns are handled separately.
4499       const int adjusted_width_micro_repeats =
4500           (output_width_micro_repeats < output_width_overall_micro_repeats) &&
4501                   (residual_width < 4)
4502               ? output_width_micro_repeats
4503               : output_width_overall_micro_repeats;
4504 
4505       if (block_height == 4) {
4506         for (int s = 0; s < 2; ++s) {
4507           // Work through one slice, by row, at a time.
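          // Each slice s covers one 4-channel half of the depth-8 micro block;
          // four output rows are accumulated per pass (acc0..acc3).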
4508           typename QuantizationTypeImpl<quantization_type>::ExternalType*
4509               output_data_base = output_data_depthwise + 4 * s;
4510 
4511           const int8* next_input_data = scratch_block_data;
4512           typename QuantizationTypeImpl<quantization_type>::ExternalType*
4513               output_data = output_data_base;
4514 
4515           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
4516           bias_data += kBiasIncrement;
4517 
4518           int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
4519           int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
4520           int8x16_t input_bank_c_reg;  //  left 4, right 4, left 5, right 5.
4521 
4522           // Load first sub-micro block of data into operational banks.
4523           input_bank_a_reg =
4524               vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
4525                                                 // uninitialized variable.
4526           input_bank_a_reg = vld1q_lane_8x4(
4527               next_input_data + workspace_height_stride, input_bank_a_reg, 2);
4528           input_bank_b_reg = vld1q_dup_s8x4(
4529               next_input_data +
4530               2 * workspace_height_stride);  // Load lane 0, avoiding
4531                                              // uninitialized variable.
4532           input_bank_b_reg =
4533               vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4534                              input_bank_b_reg, 2);
4535           input_bank_c_reg = vld1q_dup_s8x4(
4536               next_input_data +
4537               4 * workspace_height_stride);  // Load lane 0, avoiding
4538                                              // uninitialized variable.
4539           input_bank_c_reg =
4540               vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4541                              input_bank_c_reg, 2);
4542 
4543           int32x4_t acc0;
4544           int32x4_t acc1;
4545           int32x4_t acc2;
4546           int32x4_t acc3;
4547 
4548           acc0 = adjusted_bias_data;
4549           acc1 = adjusted_bias_data;
4550           acc2 = adjusted_bias_data;
4551           acc3 = adjusted_bias_data;
4552 
4553           acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
4554           acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
4555           acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
4556           acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
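               // These four dot products are hoisted ahead of the width loop; the
               // tail of each loop iteration issues the same four for the next
               // iteration, so loads and multiplies overlap (software pipelining).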
4557 
4558           int i_width = 0;
4559           for (; i_width < adjusted_width_micro_repeats; ++i_width) {
4560             next_input_data += 4;
4561 
4562             // Iterate over input width shifts within 4x4 blocks.
4563             {
4564               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4565                                          0);
4566               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4567                                          2);
4568               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4569                                          2);
4570               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4571                                          2);
4572               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4573                                          2);
4574               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4575                                          0);
4576               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4577                                          0);
4578               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4579                                          2);
4580 
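                   // Scalar sketch of the requantization applied below to each lane
                   // (illustrative only; helper names follow gemmlowp conventions):
                   //   int32 v = SaturatingRoundingDoublingHighMul(acc, output_multiplier);
                   //   v = RoundingDivideByPOT(v, -output_shift);
                   //   v += output_offset;
                   //   v = Clamp(v, output_activation_min, output_activation_max);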
4581               // Fixed-point multiplication.
4582               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4583               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4584                   acc0, -output_shift);
4585               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4586               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4587                   acc1, -output_shift);
4588               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4589               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4590                   acc2, -output_shift);
4591               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4592               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4593                   acc3, -output_shift);
4594               // Add the output offset.
4595               int16x8_t acc_s16_0_1 =
4596                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4597               int16x8_t acc_s16_2_3 =
4598                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4599               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4600               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4601               // Apply the activation function.
4602               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4603                                                   vqmovxn_s16(acc_s16_2_3));
4604               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4605               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4606 
4607               vst1q_lane_u8x4(output_data, acc_u8_all, 0);
4608               vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
4609                               1);
4610               vst1q_lane_u8x4(output_data + 2 * output_height_stride,
4611                               acc_u8_all, 2);
4612               vst1q_lane_u8x4(output_data + 3 * output_height_stride,
4613                               acc_u8_all, 3);
4614 
4615               output_data += output_depth;
4616             }
4617             // Load next sub-micro block of data.
4618             input_bank_a_reg =
4619                 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
4620             input_bank_a_reg = vld1q_lane_8x4(
4621                 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
4622             input_bank_b_reg =
4623                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
4624                                input_bank_b_reg, 1);
4625             input_bank_b_reg =
4626                 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4627                                input_bank_b_reg, 3);
4628             input_bank_c_reg =
4629                 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
4630                                input_bank_c_reg, 1);
4631             input_bank_c_reg =
4632                 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4633                                input_bank_c_reg, 3);
4634 
4635             {
4636               acc0 = adjusted_bias_data;
4637               acc1 = adjusted_bias_data;
4638               acc2 = adjusted_bias_data;
4639               acc3 = adjusted_bias_data;
4640 
4641               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
4642                                          input_bank_a_reg, 0);
4643               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
4644                                          input_bank_a_reg, 2);
4645               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
4646                                          input_bank_b_reg, 0);
4647               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
4648                                          input_bank_a_reg, 2);
4649               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
4650                                          input_bank_b_reg, 0);
4651               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
4652                                          input_bank_b_reg, 2);
4653               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
4654                                          input_bank_b_reg, 0);
4655               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
4656                                          input_bank_b_reg, 2);
4657               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
4658                                          input_bank_c_reg, 0);
4659               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
4660                                          input_bank_b_reg, 2);
4661               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
4662                                          input_bank_c_reg, 0);
4663               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
4664                                          input_bank_c_reg, 2);
4665 
4666               // Fixed-point multiplication.
4667               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4668               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4669                   acc0, -output_shift);
4670               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4671               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4672                   acc1, -output_shift);
4673               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4674               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4675                   acc2, -output_shift);
4676               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4677               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4678                   acc3, -output_shift);
4679               // Add the output offset.
4680               int16x8_t acc_s16_0_1 =
4681                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4682               int16x8_t acc_s16_2_3 =
4683                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4684               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4685               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4686               // Apply the activation function.
4687               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4688                                                   vqmovxn_s16(acc_s16_2_3));
4689               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4690               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4691 
4692               vst1q_lane_u8x4(output_data, acc_u8_all, 0);
4693               vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
4694                               1);
4695               vst1q_lane_u8x4(output_data + 2 * output_height_stride,
4696                               acc_u8_all, 2);
4697               vst1q_lane_u8x4(output_data + 3 * output_height_stride,
4698                               acc_u8_all, 3);
4699 
4700               input_bank_a_reg = vreinterpretq_s8_u64(
4701                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
4702               input_bank_b_reg = vreinterpretq_s8_u64(
4703                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
4704               input_bank_c_reg = vreinterpretq_s8_u64(
4705                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
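                   // Each 64-bit half of a bank holds one row; shifting it right by
                   // 16 bits discards the two oldest pixels, advancing the input
                   // window past the two output columns just produced.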
4706 
4707               output_data += output_depth;
4708             }
4709 
4710             {
4711               acc0 = adjusted_bias_data;
4712               acc1 = adjusted_bias_data;
4713               acc2 = adjusted_bias_data;
4714               acc3 = adjusted_bias_data;
4715 
4716               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4717                                          0);
4718               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4719                                          2);
4720               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4721                                          0);
4722               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4723                                          2);
4724               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4725                                          0);
4726               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4727                                          2);
4728               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4729                                          0);
4730               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4731                                          2);
4732               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4733                                          0);
4734               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4735                                          2);
4736               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4737                                          0);
4738               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4739                                          2);
4740 
4741               // Fixed-point multiplication.
4742               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4743               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4744                   acc0, -output_shift);
4745               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4746               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4747                   acc1, -output_shift);
4748               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4749               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4750                   acc2, -output_shift);
4751               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4752               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4753                   acc3, -output_shift);
4754               // Add the output offset.
4755               int16x8_t acc_s16_0_1 =
4756                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4757               int16x8_t acc_s16_2_3 =
4758                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4759               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4760               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4761               // Apply the activation function.
4762               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4763                                                   vqmovxn_s16(acc_s16_2_3));
4764               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4765               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4766 
4767               vst1q_lane_u8x4(output_data, acc_u8_all, 0);
4768               vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
4769                               1);
4770               vst1q_lane_u8x4(output_data + 2 * output_height_stride,
4771                               acc_u8_all, 2);
4772               vst1q_lane_u8x4(output_data + 3 * output_height_stride,
4773                               acc_u8_all, 3);
4774 
4775               output_data += output_depth;
4776             }
4777 
4778             {
4779               acc0 = adjusted_bias_data;
4780               acc1 = adjusted_bias_data;
4781               acc2 = adjusted_bias_data;
4782               acc3 = adjusted_bias_data;
4783 
4784               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
4785                                          input_bank_a_reg, 0);
4786               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
4787                                          input_bank_a_reg, 2);
4788               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
4789                                          input_bank_b_reg, 0);
4790               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
4791                                          input_bank_a_reg, 2);
4792               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
4793                                          input_bank_b_reg, 0);
4794               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
4795                                          input_bank_b_reg, 2);
4796               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
4797                                          input_bank_b_reg, 0);
4798               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
4799                                          input_bank_b_reg, 2);
4800               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
4801                                          input_bank_c_reg, 0);
4802               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
4803                                          input_bank_b_reg, 2);
4804               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
4805                                          input_bank_c_reg, 0);
4806               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
4807                                          input_bank_c_reg, 2);
4808 
4809               // Fixed-point multiplication.
4810               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4811               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4812                   acc0, -output_shift);
4813               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4814               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4815                   acc1, -output_shift);
4816               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4817               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4818                   acc2, -output_shift);
4819               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4820               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4821                   acc3, -output_shift);
4822               // Add the output offset.
4823               int16x8_t acc_s16_0_1 =
4824                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4825               int16x8_t acc_s16_2_3 =
4826                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4827               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4828               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4829               // Apply the activation function.
4830               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4831                                                   vqmovxn_s16(acc_s16_2_3));
4832               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4833               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4834 
4835               vst1q_lane_u8x4(output_data, acc_u8_all, 0);
4836               vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
4837                               1);
4838               vst1q_lane_u8x4(output_data + 2 * output_height_stride,
4839                               acc_u8_all, 2);
4840               vst1q_lane_u8x4(output_data + 3 * output_height_stride,
4841                               acc_u8_all, 3);
4842 
4843               input_bank_a_reg = vreinterpretq_s8_u64(
4844                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
4845               input_bank_b_reg = vreinterpretq_s8_u64(
4846                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
4847               input_bank_c_reg = vreinterpretq_s8_u64(
4848                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
4849 
4850               output_data += output_depth;
4851               acc0 = adjusted_bias_data;
4852               acc1 = adjusted_bias_data;
4853               acc2 = adjusted_bias_data;
4854               acc3 = adjusted_bias_data;
4855 
4856               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4857                                          0);
4858               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4859                                          0);
4860               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4861                                          0);
4862               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4863                                          2);
4864             }
4865           }
4866 
4867           if (i_width < output_width_overall_micro_repeats) {
4868             next_input_data += 4;
4869             const int output_width = residual_width;
4870 
4871             // Load next sub-micro block of data.
4872             input_bank_a_reg =
4873                 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
4874             input_bank_a_reg = vld1q_lane_8x4(
4875                 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
4876             input_bank_b_reg =
4877                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
4878                                input_bank_b_reg, 1);
4879             input_bank_b_reg =
4880                 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4881                                input_bank_b_reg, 3);
4882             input_bank_c_reg =
4883                 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
4884                                input_bank_c_reg, 1);
4885             input_bank_c_reg =
4886                 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4887                                input_bank_c_reg, 3);
4888 
4889             // Iterate over input width shifts within 4x4 blocks.
4890             for (int x = 0; x < output_width; ++x) {
4891               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4892                                          0);
4893               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4894                                          2);
4895               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4896                                          2);
4897               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4898                                          2);
4899               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4900                                          2);
4901               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4902                                          0);
4903               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4904                                          0);
4905               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4906                                          2);
4907 
4908               // Fixed-point multiplication.
4909               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4910               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4911                   acc0, -output_shift);
4912               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4913               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4914                   acc1, -output_shift);
4915               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4916               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4917                   acc2, -output_shift);
4918               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4919               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4920                   acc3, -output_shift);
4921               // Add the output offset.
4922               int16x8_t acc_s16_0_1 =
4923                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4924               int16x8_t acc_s16_2_3 =
4925                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4926               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4927               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4928               // Apply the activation function.
4929               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4930                                                   vqmovxn_s16(acc_s16_2_3));
4931               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4932               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4933 
4934               vst1q_lane_u8x4(output_data, acc_u8_all, 0);
4935               vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
4936                               1);
4937               vst1q_lane_u8x4(output_data + 2 * output_height_stride,
4938                               acc_u8_all, 2);
4939               vst1q_lane_u8x4(output_data + 3 * output_height_stride,
4940                               acc_u8_all, 3);
4941 
4942               input_bank_a_reg = vreinterpretq_s8_u64(
4943                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 8));
4944               input_bank_b_reg = vreinterpretq_s8_u64(
4945                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 8));
4946               input_bank_c_reg = vreinterpretq_s8_u64(
4947                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 8));
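                   // The residual path produces one output column per iteration, so
                   // the window advances by a single pixel (8 bits) instead of two.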
4948 
4949               output_data += output_depth;
4950 
4951               acc0 = adjusted_bias_data;
4952               acc1 = adjusted_bias_data;
4953               acc2 = adjusted_bias_data;
4954               acc3 = adjusted_bias_data;
4955 
4956               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4957                                          0);
4958               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4959                                          0);
4960               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4961                                          0);
4962               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4963                                          2);
4964             }
4965           }
4966           // scratch_block_data += 4 * workspace_height_stride;
4967           output_data_base += 4 * output_height_stride;
4968 
4969           // Move to next sub-block: advance to second set of filters, to new
4970           // bias.
4971           filter_reg_0_a = filter_reg_0_b;
4972           filter_reg_1_a = filter_reg_1_b;
4973           filter_reg_2_a = filter_reg_2_b;
4974           filter_reg_0_a_shifted = vreinterpretq_s8_u32(
4975               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
4976           filter_reg_1_a_shifted = vreinterpretq_s8_u32(
4977               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
4978           filter_reg_2_a_shifted = vreinterpretq_s8_u32(
4979               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
4980         }
4981       } else {
4982         // Block height < 4.
4983         typename QuantizationTypeImpl<quantization_type>::ExternalType*
4984             output_data_base = output_data_depthwise;
4985 
4986         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
4987         bias_data += kBiasIncrement;
4988         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
4989         bias_data += kBiasIncrement;
4990 
4991         for (int k_height = 0; k_height < block_height; ++k_height) {
4992           const int8* next_input_data =
4993               scratch_block_data + k_height * workspace_height_stride;
4994           typename QuantizationTypeImpl<quantization_type>::ExternalType*
4995               output_data = output_data_base;
4996 
4997           int8x16_t input_bank_p_reg;  //  left 0, right 0, left 1, right 1.
4998           int8x16_t input_bank_q_reg;  //  left 2, right 2, left 3, right 3.
4999 
5000           // Load first sub-micro block of data into operational banks.
5001           input_bank_p_reg =
5002               vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
5003                                                 // uninitialized variable.
5004           input_bank_p_reg = vld1q_lane_8x4(
5005               next_input_data + workspace_height_stride, input_bank_p_reg, 2);
5006           input_bank_q_reg = vld1q_dup_s8x4(
5007               next_input_data +
5008               2 * workspace_height_stride);  // Load lane 0, avoiding
5009                                              // uninitialized variable.
5010 
5011           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
5012                ++i_width) {
5013             next_input_data += 4;
5014             const int output_width =
5015                 i_width == output_width_micro_repeats ? residual_width : 4;
5016 
5017             // Load next sub-micro block of data.
5018             input_bank_p_reg =
5019                 vld1q_lane_8x4(next_input_data, input_bank_p_reg, 1);
5020             input_bank_p_reg = vld1q_lane_8x4(
5021                 next_input_data + workspace_height_stride, input_bank_p_reg, 3);
5022             input_bank_q_reg =
5023                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
5024                                input_bank_q_reg, 1);
5025             // Iterate over input width shifts within 4x4 blocks.
5026             for (int x = 0; x < output_width; ++x) {
5027               int32x4_t acc_a = adjusted_bias_data_a;
5028               int32x4_t acc_b = adjusted_bias_data_b;
5029               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_0_a,
5030                                           input_bank_p_reg, 0);
5031               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_1_a,
5032                                           input_bank_p_reg, 2);
5033               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_2_a,
5034                                           input_bank_q_reg, 0);
5035               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_0_b,
5036                                           input_bank_p_reg, 0);
5037               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_1_b,
5038                                           input_bank_p_reg, 2);
5039               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_2_b,
5040                                           input_bank_q_reg, 0);
5041 
5042               // Fixed-point multiplication.
5043               acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
5044               acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
5045               acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5046                   acc_a, -output_shift);
5047               acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5048                   acc_b, -output_shift);
5049               // Add the output offset.
5050               int16x8_t acc_s16_0_0 =
5051                   vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
5052               acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
5053               // Apply the activation function.
5054               uint8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
5055               acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
5056                                         vget_low_u8(output_activation_min_vec));
5057               acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
5058                                         vget_low_u8(output_activation_max_vec));
5059 
5060               util_vst1_u8(output_data, acc_u8_0_0);
5061 
5062               input_bank_p_reg = vreinterpretq_s8_u64(
5063                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_p_reg), 8));
5064               input_bank_q_reg = vreinterpretq_s8_u64(
5065                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_q_reg), 8));
5066 
5067               output_data += output_depth;
5068             }
5069           }
5070           output_data_base += output_height_stride;
5071         }
5072       }
5073       output_data_depthwise += 8;
5074     }
5075   }  // NOLINT(readability/fn_size) Manually unrolled.
5076 
5077   static inline void Run(const int8* scratch_block_data,
5078                          const int8* filter_workspace, const int32* bias_data,
5079                          uint8* output_block_data,
5080                          const DepthwiseConvDotProdParams* function_params) {
5081     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
5082                                output_block_data, function_params);
5083   }
5084 };
5085 
5086 template <>
5087 struct KernelMacroBlock<
5088     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
5089     QuantizationType::kNonPerChannelUint8,
5090     DepthwiseConvDepthMultiplication::kUnitInputDepth,
5091     /*stride=*/2> {
5092   static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
5093   static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
5094     return vmin_u8(a, b);
5095   }
5096   static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
5097     return vmax_u8(a, b);
5098   }
5099 
5100   static inline void KernelMacroBlockIntrinsics(
5101       const int8* scratch_block_data, const int8* filter_workspace,
5102       const int32* bias_data, uint8* output_block_data,
5103       const DepthwiseConvDotProdParams* function_params) {
5104     static constexpr QuantizationType quantization_type =
5105         QuantizationType::kNonPerChannelUint8;
5106 
5107     const int workspace_height_stride =
5108         function_params->workspace_height_stride;
5109     const int output_width_micro_repeats =
5110         function_params->output_width_micro_repeats;
5111     const int depth_micro_repeats = function_params->depth_micro_repeats;
5112     const int output_depth = function_params->output_depth;
5113     constexpr int kStrideVal = 2;
5114     TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
5115 
5116     const int output_width_overall_micro_repeats =
5117         function_params->output_width_overall_micro_repeats;
5118     const int block_height = function_params->outbound_block_height;
5119     const int residual_width = function_params->output_residual_width;
5120     const int output_height_stride = function_params->output_height_stride;
5121     constexpr int kBiasIncrement = 4;
5122 
5123     const int32 output_activation_min =
5124         function_params->quantized_activation_min;
5125     const int32 output_activation_max =
5126         function_params->quantized_activation_max;
5127     const int32 output_multiplier = function_params->output_multiplier;
5128     const int32 output_shift = function_params->output_shift;
5129     const int32 output_offset = function_params->output_offset;
5130     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
5131       TFLITE_DCHECK_GE(output_activation_min, 0);
5132       TFLITE_DCHECK_LT(output_activation_min, 256);
5133       TFLITE_DCHECK_GE(output_activation_max, 0);
5134       TFLITE_DCHECK_LT(output_activation_max, 256);
5135     } else {
5136       TFLITE_DCHECK_GE(output_activation_min, -128);
5137       TFLITE_DCHECK_LT(output_activation_min, 128);
5138       TFLITE_DCHECK_GE(output_activation_max, -128);
5139       TFLITE_DCHECK_LT(output_activation_max, 128);
5140     }
5141     TFLITE_DCHECK_GE(output_offset, -32768);
5142     TFLITE_DCHECK_LT(output_offset, 32768);
5143 
5144     TFLITE_DCHECK_GE(depth_micro_repeats, 1);
5145 
5146     const int16x8_t output_offset_vec =
5147         vdupq_n_s16(static_cast<int16>(output_offset));
5148     const uint8x16_t output_activation_min_vec =
5149         vdupq_n_u8(static_cast<uint8>(output_activation_min));
5150     const uint8x16_t output_activation_max_vec =
5151         vdupq_n_u8(static_cast<uint8>(output_activation_max));
5152 
5153     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
5154       int8x16_t filter_reg_0_a;
5155       int8x16_t filter_reg_0_b;
5156       int8x16_t filter_reg_1_a;
5157       int8x16_t filter_reg_1_b;
5158       int8x16_t filter_reg_2_a;
5159       int8x16_t filter_reg_2_b;
5160 
5161       filter_reg_0_a = vld1q_s8(filter_workspace);
5162       filter_workspace += 16;
5163       filter_reg_0_b = vld1q_s8(filter_workspace);
5164       filter_workspace += 16;
5165       filter_reg_1_a = vld1q_s8(filter_workspace);
5166       filter_workspace += 16;
5167       filter_reg_1_b = vld1q_s8(filter_workspace);
5168       filter_workspace += 16;
5169       filter_reg_2_a = vld1q_s8(filter_workspace);
5170       filter_workspace += 16;
5171       filter_reg_2_b = vld1q_s8(filter_workspace);
5172       filter_workspace += 16;
5173 
5174       const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
5175       bias_data += kBiasIncrement;
5176       const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
5177       bias_data += kBiasIncrement;
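           // The two bias vectors correspond to the two 4-channel halves of this
           // 8-channel depth micro block, matching the "_a" and "_b" filter
           // registers used below.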
5178 
5179       if (block_height == 2) {
5180         const int8* scratch_data = scratch_block_data;
5181         typename QuantizationTypeImpl<quantization_type>::ExternalType*
5182             output_data = output_block_data + 8 * j_depth;
5183 
5184         int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
5185         int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
5186         int8x16_t input_bank_c_reg;  //  left 4, right 4, xxx, xxx.
5187 
5188         // Load first sub-micro block of data into operational banks.
5189         input_bank_a_reg =
5190             vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
5191                                            // uninitialized variable.
5192         input_bank_a_reg = vld1q_lane_8x4(
5193             scratch_data + workspace_height_stride, input_bank_a_reg, 2);
5194         input_bank_b_reg = vld1q_dup_s8x4(
5195             scratch_data +
5196             2 * workspace_height_stride);  // Load lane 0, avoiding
5197                                            // uninitialized variable.
5198         input_bank_b_reg = vld1q_lane_8x4(
5199             scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
5200         input_bank_c_reg = vld1q_dup_s8x4(
5201             scratch_data +
5202             4 * workspace_height_stride);  // Load lane 0, avoiding
5203                                            // uninitialized variable.
5204 
5205         int32x4_t acc0;
5206         int32x4_t acc1;
5207 
5208         // When output_width_micro_repeats < output_width_overall_micro_repeats,
5209         // 0 < residual_width <= 2, and so residual_width == 1 is then true iff
5210         // residual_width < 2.
5211         const int adjusted_width_micro_repeats =
5212             (output_width_micro_repeats < output_width_overall_micro_repeats) &&
5213                     (residual_width < 2)
5214                 ? output_width_micro_repeats
5215                 : output_width_overall_micro_repeats;
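             // At stride 2 a micro block spans 4 input pixels but only 2 output
             // columns, so a residual narrower than 2 is routed to the
             // single-column tail loop below.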
5216 
5217         int i_width = 0;
5218         for (; i_width < adjusted_width_micro_repeats; ++i_width) {
5219           const int8* input_data = scratch_data + 4 + 4 * i_width;
5220 
5221           // Load next sub-micro block of data.
5222           input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5223           input_bank_a_reg = vld1q_lane_8x4(
5224               input_data + workspace_height_stride, input_bank_a_reg, 3);
5225           input_bank_b_reg = vld1q_lane_8x4(
5226               input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5227           input_bank_b_reg = vld1q_lane_8x4(
5228               input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
5229           input_bank_c_reg = vld1q_lane_8x4(
5230               input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
5231 
5232           int16x8_t acc_s16_0_1;
5233           uint8x8_t acc_u8_0_1;
5234           // Iterate over input width shifts within 4x4 blocks.
5235           {
5236             acc0 = adjusted_bias_data_s_0;
5237             acc1 = adjusted_bias_data_s_0;
5238 
5239             acc0 =
5240                 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5241             acc0 =
5242                 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5243             acc0 =
5244                 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5245             acc1 =
5246                 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5247             acc1 =
5248                 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5249             acc1 =
5250                 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5251 
5252             // Fixed-point multiplication.
5253             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5254             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5255                 acc0, -output_shift);
5256             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5257             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5258                 acc1, -output_shift);
5259             // Add the output offset.
5260             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5261             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5262             // Apply the activation function.
5263             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5264             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5265                                       vget_low_u8(output_activation_min_vec));
5266             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5267                                       vget_low_u8(output_activation_max_vec));
5268 
5269             vst1_lane_u8x4(output_data, acc_u8_0_1, 0);
5270             vst1_lane_u8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5271 
5272             acc0 = adjusted_bias_data_s_1;
5273             acc1 = adjusted_bias_data_s_1;
5274 
5275             acc0 =
5276                 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5277             acc0 =
5278                 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5279             acc0 =
5280                 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5281             acc1 =
5282                 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5283             acc1 =
5284                 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5285             acc1 =
5286                 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5287 
5288             // Fixed-point multiplication.
5289             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5290             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5291                 acc0, -output_shift);
5292             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5293             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5294                 acc1, -output_shift);
5295             // Add the output offset.
5296             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5297             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5298             // Apply the activation function.
5299             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5300             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5301                                       vget_low_u8(output_activation_min_vec));
5302             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5303                                       vget_low_u8(output_activation_max_vec));
5304 
5305             vst1_lane_u8x4(output_data + 4, acc_u8_0_1, 0);
5306             vst1_lane_u8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
5307                            1);
5308 
5309             input_bank_a_reg = vreinterpretq_s8_u64(
5310                 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
5311             input_bank_b_reg = vreinterpretq_s8_u64(
5312                 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
5313             input_bank_c_reg = vreinterpretq_s8_u64(
5314                 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
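                 // Dropping 16 bits (two input pixels) per row advances the window
                 // by exactly one output column at stride 2.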
5315 
5316             output_data += output_depth;
5317           }
5318 
5319           // output_width == four_over_stride.
5320           acc0 = adjusted_bias_data_s_0;
5321           acc1 = adjusted_bias_data_s_0;
5322 
5323           acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5324           acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5325           acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5326           acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5327           acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5328           acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5329 
5330           // Fixed-point multiplication.
5331           acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5332           acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5333               acc0, -output_shift);
5334           acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5335           acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5336               acc1, -output_shift);
5337           // Add the output offset.
5338           acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5339           acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5340           // Apply the activation function.
5341           acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5342           acc_u8_0_1 =
5343               util_vmax_x8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
5344           acc_u8_0_1 =
5345               util_vmin_x8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
5346 
5347           vst1_lane_u8x4(output_data, acc_u8_0_1, 0);
5348           vst1_lane_u8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5349 
5350           acc0 = adjusted_bias_data_s_1;
5351           acc1 = adjusted_bias_data_s_1;
5352 
5353           acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5354           acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5355           acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5356           acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5357           acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5358           acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5359 
5360           // Fixed-point multiplication.
5361           acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5362           acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5363               acc0, -output_shift);
5364           acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5365           acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5366               acc1, -output_shift);
5367           // Add the output offset.
5368           acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5369           acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5370           // Apply the activation function.
5371           acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5372           acc_u8_0_1 =
5373               util_vmax_x8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
5374           acc_u8_0_1 =
5375               util_vmin_x8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
5376 
5377           vst1_lane_u8x4(output_data + 4, acc_u8_0_1, 0);
5378           vst1_lane_u8x4(output_data + 4 + output_height_stride, acc_u8_0_1, 1);
5379 
5380           input_bank_a_reg = vreinterpretq_s8_u64(
5381               vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 8));
5382           input_bank_b_reg = vreinterpretq_s8_u64(
5383               vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 8));
5384           input_bank_c_reg = vreinterpretq_s8_u64(
5385               vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 8));
5386 
5387           output_data += output_depth;
5388         }
5389         for (; i_width < output_width_overall_micro_repeats; ++i_width) {
5390           // output_width == 1.
5391           const int8* input_data = scratch_data + 4 + 4 * i_width;
5392 
5393           // Load next sub-micro block of data.
5394           input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5395           input_bank_a_reg = vld1q_lane_8x4(
5396               input_data + workspace_height_stride, input_bank_a_reg, 3);
5397           input_bank_b_reg = vld1q_lane_8x4(
5398               input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5399           input_bank_b_reg = vld1q_lane_8x4(
5400               input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
5401           input_bank_c_reg = vld1q_lane_8x4(
5402               input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
5403 
5404           int16x8_t acc_s16_0_1;
5405           uint8x8_t acc_u8_0_1;
5406           // Iterate over input width shifts within 4x4 blocks.
5407           {
5408             acc0 = adjusted_bias_data_s_0;
5409             acc1 = adjusted_bias_data_s_0;
5410 
5411             acc0 =
5412                 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5413             acc0 =
5414                 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5415             acc0 =
5416                 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5417             acc1 =
5418                 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5419             acc1 =
5420                 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5421             acc1 =
5422                 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5423 
5424             // Fixed-point multiplication.
5425             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5426             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5427                 acc0, -output_shift);
5428             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5429             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5430                 acc1, -output_shift);
5431             // Add the output offset.
5432             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5433             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5434             // Apply the activation function.
5435             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5436             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5437                                       vget_low_u8(output_activation_min_vec));
5438             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5439                                       vget_low_u8(output_activation_max_vec));
5440 
5441             vst1_lane_u8x4(output_data, acc_u8_0_1, 0);
5442             vst1_lane_u8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5443 
5444             acc0 = adjusted_bias_data_s_1;
5445             acc1 = adjusted_bias_data_s_1;
5446 
5447             acc0 =
5448                 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5449             acc0 =
5450                 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5451             acc0 =
5452                 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5453             acc1 =
5454                 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5455             acc1 =
5456                 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5457             acc1 =
5458                 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5459 
5460             // Fixed-point multiplication.
5461             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5462             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5463                 acc0, -output_shift);
5464             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5465             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5466                 acc1, -output_shift);
5467             // Add the output offset.
5468             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5469             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5470             // Apply the activation function.
5471             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5472             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5473                                       vget_low_u8(output_activation_min_vec));
5474             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5475                                       vget_low_u8(output_activation_max_vec));
5476 
5477             vst1_lane_u8x4(output_data + 4, acc_u8_0_1, 0);
5478             vst1_lane_u8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
5479                            1);
5480 
5481             input_bank_a_reg = vreinterpretq_s8_u64(
5482                 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
5483             input_bank_b_reg = vreinterpretq_s8_u64(
5484                 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
5485             input_bank_c_reg = vreinterpretq_s8_u64(
5486                 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
5487 
5488             output_data += output_depth;
5489           }
5490         }
5491       } else {
5492         TFLITE_DCHECK_EQ(block_height, 1);
5493         // Work through one slice, by row, at a time.
5494         const int8* scratch_data = scratch_block_data;
5495         typename QuantizationTypeImpl<quantization_type>::ExternalType*
5496             output_data = output_block_data + 8 * j_depth;
5497 
5498         int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
5499         int8x16_t input_bank_b_reg;  //  left 2, right 2, xxx, xxx.
5500 
5501         // Load first sub-micro block of data into operational banks.
5502         input_bank_a_reg =
5503             vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
5504                                            // uninitialized variable.
5505         input_bank_a_reg = vld1q_lane_8x4(
5506             scratch_data + workspace_height_stride, input_bank_a_reg, 2);
5507         input_bank_b_reg = vld1q_dup_s8x4(
5508             scratch_data +
5509             2 * workspace_height_stride);  // Load lane 0, avoiding
5510                                            // uninitialized variable.
5511 
5512         int32x4_t acc0;
5513         int32x4_t acc1;
5514 
5515         for (int i_width = 0; i_width < output_width_overall_micro_repeats;
5516              ++i_width) {
5517           const int output_width =
5518               i_width == output_width_micro_repeats ? residual_width : 2;
5519 
5520           TFLITE_DCHECK_LE(output_width, 2);
5521           TFLITE_DCHECK_GE(output_width, 1);
5522           TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
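               // With kStrideVal == 2, the 4-pixel-wide input micro block yields
               // at most two output columns, which is what the checks above assert.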
5523           const int8* input_data = scratch_data + 4 + 4 * i_width;
5524 
5525           // Load next sub-micro block of data.
5526           input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5527           input_bank_a_reg = vld1q_lane_8x4(
5528               input_data + workspace_height_stride, input_bank_a_reg, 3);
5529           input_bank_b_reg = vld1q_lane_8x4(
5530               input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5531 
5532           int16x8_t acc_s16_0_1;
5533           uint8x8_t acc_u8_0_1;
5534 
5535           // Iterate over input width shifts within 4x4 blocks.
5536           {
5537             acc0 = adjusted_bias_data_s_0;
5538 
5539             acc0 =
5540                 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5541             acc0 =
5542                 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5543             acc0 =
5544                 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5545 
5546             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5547             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5548                 acc0, -output_shift);
5549 
5550             // Second sub-block accumulation.
5551             acc1 = adjusted_bias_data_s_1;
5552 
5553             acc1 =
5554                 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
5555             acc1 =
5556                 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
5557             acc1 =
5558                 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
5559 
5560             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5561             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5562                 acc1, -output_shift);
5563 
5564             // Add the output offset.
5565             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5566             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5567             // Apply the activation function.
5568             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5569             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5570                                       vget_low_u8(output_activation_min_vec));
5571             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5572                                       vget_low_u8(output_activation_max_vec));
5573 
5574             // This stores the results for both sub-blocks together.
5575             util_vst1_u8(output_data, acc_u8_0_1);
5576 
5577             input_bank_a_reg = vreinterpretq_s8_u64(
5578                 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
5579             input_bank_b_reg = vreinterpretq_s8_u64(
5580                 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
5581 
5582             output_data += output_depth;
5583           }
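          // Produce the second output column only when this micro block covers
          // the full width of two columns.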
5584           if (output_width == 2) {
5585             acc0 = adjusted_bias_data_s_0;
5586 
5587             acc0 =
5588                 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5589             acc0 =
5590                 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5591             acc0 =
5592                 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5593 
5594             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5595             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5596                 acc0, -output_shift);
5597 
5598             // Second sub-block accumulation.
5599             acc1 = adjusted_bias_data_s_1;
5600 
5601             acc1 =
5602                 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
5603             acc1 =
5604                 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
5605             acc1 =
5606                 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
5607 
5608             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5609             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5610                 acc1, -output_shift);
5611 
5612             // Add the output offset.
5613             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5614             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5615             // Apply the activation function.
5616             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5617             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5618                                       vget_low_u8(output_activation_min_vec));
5619             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5620                                       vget_low_u8(output_activation_max_vec));
5621 
5622             // This stores the results for both sub-blocks together.
5623             util_vst1_u8(output_data, acc_u8_0_1);
5624 
5625             input_bank_a_reg = vreinterpretq_s8_u64(
5626                 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
5627             input_bank_b_reg = vreinterpretq_s8_u64(
5628                 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
5629 
5630             output_data += output_depth;
5631           }
5632         }
5633       }
5634     }
5635   }
5636 
5637   static inline void Run(const int8* scratch_block_data,
5638                          const int8* filter_workspace, const int32* bias_data,
5639                          uint8* output_block_data,
5640                          const DepthwiseConvDotProdParams* function_params) {
5641     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
5642                                output_block_data, function_params);
5643   }
5644 };
5645 
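// Per-channel int8 kernel: no depth multiplication, stride 1. Each depth
// micro iteration produces 8 output channels (two sub-blocks of 4), each
// requantized with its own per-channel multiplier and shift.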
5646 template <>
5647 struct KernelMacroBlock<
5648     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
5649     QuantizationType::kPerChannelInt8,
5650     DepthwiseConvDepthMultiplication::kNoMultiplication,
5651     /*stride=*/1> {
5652   static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
5653   static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
5654     return vmin_s8(a, b);
5655   }
5656   static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
5657     return vmax_s8(a, b);
5658   }
5659   static inline int8x16_t util_vminq_x8(int8x16_t a, int8x16_t b) {
5660     return vminq_s8(a, b);
5661   }
5662   static inline int8x16_t util_vmaxq_x8(int8x16_t a, int8x16_t b) {
5663     return vmaxq_s8(a, b);
5664   }
5665 
5666   static inline void KernelMacroBlockIntrinsics(
5667       const int8* scratch_block_data, const int8* filter_workspace,
5668       const int32* bias_data, int8* output_block_data,
5669       const DepthwiseConvDotProdParams* function_params) {
5670     static constexpr QuantizationType quantization_type =
5671         QuantizationType::kPerChannelInt8;
5672 
5673     const int workspace_height_stride =
5674         function_params->workspace_height_stride;
5675     const int input_width_overall_micro_repeats =
5676         function_params->input_width_overall_micro_repeats;
5677     const int output_width_micro_repeats =
5678         function_params->output_width_micro_repeats;
5679     const int depth_micro_repeats = function_params->depth_micro_repeats;
5680     const int depth = function_params->input_depth;
5681 
5682     const int output_width_overall_micro_repeats =
5683         function_params->output_width_overall_micro_repeats;
5684     const int block_height = function_params->outbound_block_height;
5685     const int residual_width = function_params->output_residual_width;
5686     const int output_height_stride = function_params->output_height_stride;
5687     constexpr int kBiasIncrement = 4;
5688 
5689     TFLITE_DCHECK(depth_micro_repeats > 0);
5690     const int width_micro_stride = 4 * 8;
5691     const int depth_micro_stride =
5692         width_micro_stride * input_width_overall_micro_repeats;
5693 
5694     const int32 output_activation_min =
5695         function_params->quantized_activation_min;
5696     const int32 output_activation_max =
5697         function_params->quantized_activation_max;
5698     const int32 output_offset = function_params->output_offset;
5699     const int32* output_shift_per_channel =
5700         function_params->output_shift_per_channel;
5701     const int32* output_multiplier_per_channel =
5702         function_params->output_multiplier_per_channel;
5703     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
5704       TFLITE_DCHECK_GE(output_activation_min, 0);
5705       TFLITE_DCHECK_LT(output_activation_min, 256);
5706       TFLITE_DCHECK_GE(output_activation_max, 0);
5707       TFLITE_DCHECK_LT(output_activation_max, 256);
5708     } else {
5709       TFLITE_DCHECK_GE(output_activation_min, -128);
5710       TFLITE_DCHECK_LT(output_activation_min, 128);
5711       TFLITE_DCHECK_GE(output_activation_max, -128);
5712       TFLITE_DCHECK_LT(output_activation_max, 128);
5713       TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
5714       TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
5715     }
    TFLITE_DCHECK_GE(output_offset, -32768);
5717     TFLITE_DCHECK_LT(output_offset, 32768);
5718 
5719     const int16x8_t output_offset_vec =
5720         vdupq_n_s16(static_cast<int16>(output_offset));
5721     const int8x16_t output_activation_min_vec =
5722         vdupq_n_s8(static_cast<int8>(output_activation_min));
5723     const int8x16_t output_activation_max_vec =
5724         vdupq_n_s8(static_cast<int8>(output_activation_max));
5725 
5726     const int8* input_data_depthwise = scratch_block_data;
5727     typename QuantizationTypeImpl<quantization_type>::ExternalType*
5728         output_data_depthwise = output_block_data;
5729     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
5730       // Simulate NEON-register transposition of subset of filter.
5731       int8x16_t filter_reg_0_a;
5732       int8x16_t filter_reg_0_b;
5733       int8x16_t filter_reg_1_a;
5734       int8x16_t filter_reg_1_b;
5735       int8x16_t filter_reg_2_a;
5736       int8x16_t filter_reg_2_b;
5737       int8x16_t filter_reg_0_a_shifted;
5738       int8x16_t filter_reg_1_a_shifted;
5739       int8x16_t filter_reg_2_a_shifted;
5740 
5741       filter_reg_0_a = vld1q_s8(filter_workspace);
5742       filter_workspace += 16;
5743       filter_reg_0_b = vld1q_s8(filter_workspace);
5744       filter_workspace += 16;
5745       filter_reg_1_a = vld1q_s8(filter_workspace);
5746       filter_workspace += 16;
5747       filter_reg_1_b = vld1q_s8(filter_workspace);
5748       filter_workspace += 16;
5749       filter_reg_2_a = vld1q_s8(filter_workspace);
5750       filter_workspace += 16;
5751       filter_reg_2_b = vld1q_s8(filter_workspace);
5752       filter_workspace += 16;
5753 
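      // Make copies of the filter registers with each 32-bit lane shifted up
      // by one byte, for use when the input window sits one pixel further
      // along within the data banks.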
5754       filter_reg_0_a_shifted = vreinterpretq_s8_u32(
5755           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
5756       filter_reg_1_a_shifted = vreinterpretq_s8_u32(
5757           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
5758       filter_reg_2_a_shifted = vreinterpretq_s8_u32(
5759           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
5760 
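      // Blocks of four output rows take the unrolled path below; shorter
      // blocks fall through to the general row-by-row path in the else branch.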
5761       if (block_height == 4) {
5762         for (int s = 0; s < 2; ++s) {
5763           // Work through one slice, by row, at a time.
5764           const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
5765           typename QuantizationTypeImpl<quantization_type>::ExternalType*
5766               output_data_base = output_data_depthwise + 4 * s;
5767 
5768           const int8* next_input_data = input_data_base;
5769           typename QuantizationTypeImpl<quantization_type>::ExternalType*
5770               output_data = output_data_base;
5771 
5772           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
5773           bias_data += kBiasIncrement;
5774 
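          // Per-channel requantization parameters for the four channels of
          // this sub-block.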
5775           const int32x4_t output_shift =
5776               vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
5777           const int32x4_t output_multiplier =
5778               vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
5779 
5780           // Load first sub-micro block of data into operational banks.
5781           int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
5782           int8x16_t left_bank_1_reg =
5783               vld1q_s8(next_input_data + workspace_height_stride);
5784           int8x16_t left_bank_2_reg =
5785               vld1q_s8(next_input_data + 2 * workspace_height_stride);
5786           int8x16_t left_bank_3_reg =
5787               vld1q_s8(next_input_data + 3 * workspace_height_stride);
5788           int8x16_t left_bank_4_reg =
5789               vld1q_s8(next_input_data + 4 * workspace_height_stride);
5790           int8x16_t left_bank_5_reg =
5791               vld1q_s8(next_input_data + 5 * workspace_height_stride);
5792 
5793           int32x4_t acc0;
5794           int32x4_t acc1;
5795           int32x4_t acc2;
5796           int32x4_t acc3;
5797 
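          // Seed the accumulators with the bias and start the first output
          // column's products; the remaining filter rows are added inside the
          // width loop.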
5798           acc0 = adjusted_bias_data;
5799           acc1 = adjusted_bias_data;
5800           acc2 = adjusted_bias_data;
5801           acc3 = adjusted_bias_data;
5802 
5803           acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
5804           acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
5805           acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
5806           acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
5807 
5808           for (int i_width = 0; i_width < output_width_micro_repeats;
5809                ++i_width) {
5810             next_input_data += width_micro_stride;
5811 
5812             // Iterate over input width shifts within 4x4 blocks.
5813             {
5814               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
5815               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
5816               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
5817               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
5818               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
5819               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
5820               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
5821               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
5822 
5823               // Fixed-point multiplication.
5824               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5825               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5826                   acc0, output_shift);
5827               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5828               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5829                   acc1, output_shift);
5830               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5831               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5832                   acc2, output_shift);
5833               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5834               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5835                   acc3, output_shift);
5836               // Add the output offset.
5837               int16x8_t acc_s16_0_1 =
5838                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5839               int16x8_t acc_s16_2_3 =
5840                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5841               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5842               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5843               // Apply the activation function.
5844               int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
5845                                                  vqmovxn_s16(acc_s16_2_3));
5846               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5847               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5848 
5849               vst1q_lane_s8x4(output_data, acc_u8_all, 0);
5850               vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
5851                               1);
5852               vst1q_lane_s8x4(output_data + 2 * output_height_stride,
5853                               acc_u8_all, 2);
5854               vst1q_lane_s8x4(output_data + 3 * output_height_stride,
5855                               acc_u8_all, 3);
5856 
5857               output_data += depth;
5858             }
5859 
5860             // Load next sub-micro block of data.
5861             int8x16_t right_bank_0_reg;
5862             int8x16_t right_bank_1_reg;
5863             int8x16_t right_bank_2_reg;
5864             int8x16_t right_bank_3_reg;
5865             int8x16_t right_bank_4_reg;
5866             int8x16_t right_bank_5_reg;
5867 
            // Loading the next block is always valid here.
5869             right_bank_0_reg = vld1q_s8(next_input_data);
5870             right_bank_1_reg =
5871                 vld1q_s8(next_input_data + workspace_height_stride);
5872             right_bank_2_reg =
5873                 vld1q_s8(next_input_data + 2 * workspace_height_stride);
5874             right_bank_3_reg =
5875                 vld1q_s8(next_input_data + 3 * workspace_height_stride);
5876             right_bank_4_reg =
5877                 vld1q_s8(next_input_data + 4 * workspace_height_stride);
5878             right_bank_5_reg =
5879                 vld1q_s8(next_input_data + 5 * workspace_height_stride);
5880 
5881             {
5882               acc0 = adjusted_bias_data;
5883               acc1 = adjusted_bias_data;
5884               acc2 = adjusted_bias_data;
5885               acc3 = adjusted_bias_data;
5886 
5887               acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
5888               acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
5889               acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
5890               acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
5891               acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
5892               acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
5893               acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
5894               acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
5895               acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
5896               acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
5897               acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
5898               acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
5899 
5900               // Fixed-point multiplication.
5901               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5902               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5903                   acc0, output_shift);
5904               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5905               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5906                   acc1, output_shift);
5907               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5908               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5909                   acc2, output_shift);
5910               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5911               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5912                   acc3, output_shift);
5913               // Add the output offset.
5914               int16x8_t acc_s16_0_1 =
5915                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5916               int16x8_t acc_s16_2_3 =
5917                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5918               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5919               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5920               // Apply the activation function.
5921               int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
5922                                                  vqmovxn_s16(acc_s16_2_3));
5923               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5924               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5925 
5926               vst1q_lane_s8x4(output_data, acc_u8_all, 0);
5927               vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
5928                               1);
5929               vst1q_lane_s8x4(output_data + 2 * output_height_stride,
5930                               acc_u8_all, 2);
5931               vst1q_lane_s8x4(output_data + 3 * output_height_stride,
5932                               acc_u8_all, 3);
5933 
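              // Advance the input window by two pixels: swap the 16-bit halves
              // of each 32-bit lane and interleave with the right-hand banks.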
5934               left_bank_0_reg = vreinterpretq_s8_u16(
5935                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
5936               left_bank_1_reg = vreinterpretq_s8_u16(
5937                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
5938               left_bank_2_reg = vreinterpretq_s8_u16(
5939                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
5940               left_bank_3_reg = vreinterpretq_s8_u16(
5941                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
5942               left_bank_4_reg = vreinterpretq_s8_u16(
5943                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
5944               left_bank_5_reg = vreinterpretq_s8_u16(
5945                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_5_reg)));
5946               vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
5947               vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
5948               vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
5949               vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
5950               vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
5951               vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
5952 
5953               output_data += depth;
5954             }
5955 
5956             {
5957               acc0 = adjusted_bias_data;
5958               acc1 = adjusted_bias_data;
5959               acc2 = adjusted_bias_data;
5960               acc3 = adjusted_bias_data;
5961 
5962               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
5963               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
5964               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
5965               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
5966               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
5967               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
5968               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
5969               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
5970               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
5971               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
5972               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
5973               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
5974 
5975               // Fixed-point multiplication.
5976               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5977               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5978                   acc0, output_shift);
5979               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5980               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5981                   acc1, output_shift);
5982               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5983               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5984                   acc2, output_shift);
5985               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5986               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5987                   acc3, output_shift);
5988               // Add the output offset.
5989               int16x8_t acc_s16_0_1 =
5990                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5991               int16x8_t acc_s16_2_3 =
5992                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5993               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5994               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5995               // Apply the activation function.
5996               int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
5997                                                  vqmovxn_s16(acc_s16_2_3));
5998               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5999               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
6000 
6001               vst1q_lane_s8x4(output_data, acc_u8_all, 0);
6002               vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
6003                               1);
6004               vst1q_lane_s8x4(output_data + 2 * output_height_stride,
6005                               acc_u8_all, 2);
6006               vst1q_lane_s8x4(output_data + 3 * output_height_stride,
6007                               acc_u8_all, 3);
6008 
6009               output_data += depth;
6010             }
6011 
6012             {
6013               acc0 = adjusted_bias_data;
6014               acc1 = adjusted_bias_data;
6015               acc2 = adjusted_bias_data;
6016               acc3 = adjusted_bias_data;
6017 
6018               acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
6019               acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
6020               acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
6021               acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
6022               acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
6023               acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
6024               acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
6025               acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
6026               acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
6027               acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
6028               acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
6029               acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
6030 
6031               // Fixed-point multiplication.
6032               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6033               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6034                   acc0, output_shift);
6035               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6036               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6037                   acc1, output_shift);
6038               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
6039               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6040                   acc2, output_shift);
6041               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
6042               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6043                   acc3, output_shift);
6044               // Add the output offset.
6045               int16x8_t acc_s16_0_1 =
6046                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6047               int16x8_t acc_s16_2_3 =
6048                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
6049               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6050               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
6051               // Apply the activation function.
6052               int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
6053                                                  vqmovxn_s16(acc_s16_2_3));
6054               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
6055               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
6056 
6057               vst1q_lane_s8x4(output_data, acc_u8_all, 0);
6058               vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
6059                               1);
6060               vst1q_lane_s8x4(output_data + 2 * output_height_stride,
6061                               acc_u8_all, 2);
6062               vst1q_lane_s8x4(output_data + 3 * output_height_stride,
6063                               acc_u8_all, 3);
6064 
6065               left_bank_0_reg = right_bank_0_reg;
6066               left_bank_1_reg = right_bank_1_reg;
6067               left_bank_2_reg = right_bank_2_reg;
6068               left_bank_3_reg = right_bank_3_reg;
6069               left_bank_4_reg = right_bank_4_reg;
6070               left_bank_5_reg = right_bank_5_reg;
6071 
6072               output_data += depth;
6073               acc0 = adjusted_bias_data;
6074               acc1 = adjusted_bias_data;
6075               acc2 = adjusted_bias_data;
6076               acc3 = adjusted_bias_data;
6077 
6078               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6079               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
6080               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
6081               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
6082             }
6083           }
6084 
6085           if (residual_width > 0) {
6086             next_input_data += width_micro_stride;
6087             const int output_width = residual_width;
6088 
6089             // Load next sub-micro block of data.
6090             int8x16_t right_bank_0_reg;
6091             int8x16_t right_bank_1_reg;
6092             int8x16_t right_bank_2_reg;
6093             int8x16_t right_bank_3_reg;
6094             int8x16_t right_bank_4_reg;
6095             int8x16_t right_bank_5_reg;
6096             // Logic: (output_width - 1) * stride_val < 2.
6097             const bool no_right_block = output_width < 3;
6098 
6099             if (no_right_block) {
6100               // Only needed for sanitizer checks.
6101               right_bank_0_reg = vdupq_n_s8(0);
6102               right_bank_1_reg = vdupq_n_s8(0);
6103               right_bank_2_reg = vdupq_n_s8(0);
6104               right_bank_3_reg = vdupq_n_s8(0);
6105               right_bank_4_reg = vdupq_n_s8(0);
6106               right_bank_5_reg = vdupq_n_s8(0);
6107             } else {
6108               right_bank_0_reg = vld1q_s8(next_input_data);
6109               right_bank_1_reg =
6110                   vld1q_s8(next_input_data + workspace_height_stride);
6111               right_bank_2_reg =
6112                   vld1q_s8(next_input_data + 2 * workspace_height_stride);
6113               right_bank_3_reg =
6114                   vld1q_s8(next_input_data + 3 * workspace_height_stride);
6115               right_bank_4_reg =
6116                   vld1q_s8(next_input_data + 4 * workspace_height_stride);
6117               right_bank_5_reg =
6118                   vld1q_s8(next_input_data + 5 * workspace_height_stride);
6119             }
6120 
6121             // Iterate over input width shifts within 4x4 blocks.
6122             for (int x = 0; x < output_width; ++x) {
6123               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6124               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6125               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
6126               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
6127               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
6128               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
6129               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
6130               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
6131 
6132               // Fixed-point multiplication.
6133               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6134               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6135                   acc0, output_shift);
6136               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6137               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6138                   acc1, output_shift);
6139               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
6140               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6141                   acc2, output_shift);
6142               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
6143               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6144                   acc3, output_shift);
6145               // Add the output offset.
6146               int16x8_t acc_s16_0_1 =
6147                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6148               int16x8_t acc_s16_2_3 =
6149                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
6150               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6151               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
6152               // Apply the activation function.
6153               int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
6154                                                  vqmovxn_s16(acc_s16_2_3));
6155               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
6156               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
6157 
6158               vst1q_lane_s8x4(output_data, acc_u8_all, 0);
6159               vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
6160                               1);
6161               vst1q_lane_s8x4(output_data + 2 * output_height_stride,
6162                               acc_u8_all, 2);
6163               vst1q_lane_s8x4(output_data + 3 * output_height_stride,
6164                               acc_u8_all, 3);
6165 
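              // Advance the input window by one pixel, pulling bytes in from
              // the right-hand bank registers.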
6166               biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
6167               biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
6168               biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
6169               biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
6170               biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
6171               biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
6172 
6173               output_data += depth;
6174 
6175               acc0 = adjusted_bias_data;
6176               acc1 = adjusted_bias_data;
6177               acc2 = adjusted_bias_data;
6178               acc3 = adjusted_bias_data;
6179 
6180               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6181               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
6182               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
6183               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
6184             }
6185           }
6186           input_data_base += 4 * workspace_height_stride;
6187           output_data_base += 4 * output_height_stride;
6188 
          // Move to the next sub-block: advance to the second set of filters
          // and to the next bias values.
6191           filter_reg_0_a = filter_reg_0_b;
6192           filter_reg_1_a = filter_reg_1_b;
6193           filter_reg_2_a = filter_reg_2_b;
6194           filter_reg_0_a_shifted = vreinterpretq_s8_u32(
6195               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
6196           filter_reg_1_a_shifted = vreinterpretq_s8_u32(
6197               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
6198           filter_reg_2_a_shifted = vreinterpretq_s8_u32(
6199               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
6200         }
6201       } else {
6202         const int8* input_data_base = input_data_depthwise;
6203         typename QuantizationTypeImpl<quantization_type>::ExternalType*
6204             output_data_base = output_data_depthwise;
6205 
6206         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
6207         bias_data += kBiasIncrement;
6208         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
6209         bias_data += kBiasIncrement;
6210 
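        // Per-channel requantization parameters for the eight channels of this
        // depth micro block.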
6211         const int32x4_t output_shift_a =
6212             vld1q_s32(output_shift_per_channel + j_depth * 8);
6213         const int32x4_t output_multiplier_a =
6214             vld1q_s32(output_multiplier_per_channel + j_depth * 8);
6215         const int32x4_t output_shift_b =
6216             vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
6217         const int32x4_t output_multiplier_b =
6218             vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
6219 
6220         for (int k_height = 0; k_height < block_height; ++k_height) {
6221           const int8* next_input_data = input_data_base;
6222           typename QuantizationTypeImpl<quantization_type>::ExternalType*
6223               output_data = output_data_base;
6224 
6225           // Load first sub-micro block of data into operational banks.
6226           int8x16_t left_bank_0_reg_a = vld1q_s8(next_input_data);
6227           int8x16_t left_bank_1_reg_a =
6228               vld1q_s8(next_input_data + workspace_height_stride);
6229           int8x16_t left_bank_2_reg_a =
6230               vld1q_s8(next_input_data + 2 * workspace_height_stride);
6231           int8x16_t left_bank_0_reg_b = vld1q_s8(next_input_data + 16);
6232           int8x16_t left_bank_1_reg_b =
6233               vld1q_s8(next_input_data + workspace_height_stride + 16);
6234           int8x16_t left_bank_2_reg_b =
6235               vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
6236 
6237           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
6238                ++i_width) {
6239             next_input_data += width_micro_stride;
6240             const int output_width =
6241                 i_width == output_width_micro_repeats ? residual_width : 4;
6242 
6243             int8x16_t right_bank_0_reg_a;
6244             int8x16_t right_bank_1_reg_a;
6245             int8x16_t right_bank_2_reg_a;
6246             int8x16_t right_bank_0_reg_b;
6247             int8x16_t right_bank_1_reg_b;
6248             int8x16_t right_bank_2_reg_b;
6249             // Logic: (output_width - 1) * stride_val < 2.
6250             const bool no_right_block = output_width < 3;
6251 
6252             // Load next sub-micro block of data.
6253             if (no_right_block) {
6254               // Only needed for sanitizer checks.
6255               right_bank_0_reg_a = vdupq_n_s8(0);
6256               right_bank_1_reg_a = vdupq_n_s8(0);
6257               right_bank_2_reg_a = vdupq_n_s8(0);
6258               right_bank_0_reg_b = vdupq_n_s8(0);
6259               right_bank_1_reg_b = vdupq_n_s8(0);
6260               right_bank_2_reg_b = vdupq_n_s8(0);
6261             } else {
6262               right_bank_0_reg_a = vld1q_s8(next_input_data);
6263               right_bank_1_reg_a =
6264                   vld1q_s8(next_input_data + workspace_height_stride);
6265               right_bank_2_reg_a =
6266                   vld1q_s8(next_input_data + 2 * workspace_height_stride);
6267               right_bank_0_reg_b = vld1q_s8(next_input_data + 16);
6268               right_bank_1_reg_b =
6269                   vld1q_s8(next_input_data + workspace_height_stride + 16);
6270               right_bank_2_reg_b =
6271                   vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
6272             }
6273 
6274             // Iterate over input width shifts within 4x4 blocks.
6275             for (int x = 0; x < output_width; ++x) {
6276               int32x4_t acc_a = adjusted_bias_data_a;
6277               int32x4_t acc_b = adjusted_bias_data_b;
6278               acc_a = vdotq_s32(acc_a, filter_reg_0_a, left_bank_0_reg_a);
6279               acc_a = vdotq_s32(acc_a, filter_reg_1_a, left_bank_1_reg_a);
6280               acc_a = vdotq_s32(acc_a, filter_reg_2_a, left_bank_2_reg_a);
6281               acc_b = vdotq_s32(acc_b, filter_reg_0_b, left_bank_0_reg_b);
6282               acc_b = vdotq_s32(acc_b, filter_reg_1_b, left_bank_1_reg_b);
6283               acc_b = vdotq_s32(acc_b, filter_reg_2_b, left_bank_2_reg_b);
6284 
6285               // Fixed-point multiplication.
6286               acc_a = vqrdmulhq_s32(acc_a, output_multiplier_a);
6287               acc_b = vqrdmulhq_s32(acc_b, output_multiplier_b);
6288               acc_a =
6289                   DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6290                       acc_a, output_shift_a);
6291               acc_b =
6292                   DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6293                       acc_b, output_shift_b);
6294               // Add the output offset.
6295               int16x8_t acc_s16_0_0 =
6296                   vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
6297               acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
6298               // Apply the activation function.
6299               int8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
6300               acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
6301                                         vget_low_s8(output_activation_min_vec));
6302               acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
6303                                         vget_low_s8(output_activation_max_vec));
6304 
6305               vst1_s8(output_data, acc_u8_0_0);
6306 
6307               biregister_rotate_8(&left_bank_0_reg_a, &right_bank_0_reg_a);
6308               biregister_rotate_8(&left_bank_1_reg_a, &right_bank_1_reg_a);
6309               biregister_rotate_8(&left_bank_2_reg_a, &right_bank_2_reg_a);
6310               biregister_rotate_8(&left_bank_0_reg_b, &right_bank_0_reg_b);
6311               biregister_rotate_8(&left_bank_1_reg_b, &right_bank_1_reg_b);
6312               biregister_rotate_8(&left_bank_2_reg_b, &right_bank_2_reg_b);
6313 
6314               output_data += depth;
6315             }
6316           }
6317           input_data_base += workspace_height_stride;
6318           output_data_base += output_height_stride;
6319         }
6320       }
6321       input_data_depthwise += depth_micro_stride;
6322       output_data_depthwise += 8;
6323     }
6324   }  // NOLINT(readability/fn_size) Manually unrolled.
6325 
6326   static inline void Run(const int8* scratch_block_data,
6327                          const int8* filter_workspace, const int32* bias_data,
6328                          int8* output_block_data,
6329                          const DepthwiseConvDotProdParams* function_params) {
6330     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
6331                                output_block_data, function_params);
6332   }
6333 };
6334 
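// Per-channel int8 kernel: no depth multiplication, stride 2. Output blocks
// here are at most two rows high.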
6335 template <>
6336 struct KernelMacroBlock<
6337     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
6338     QuantizationType::kPerChannelInt8,
6339     DepthwiseConvDepthMultiplication::kNoMultiplication,
6340     /*stride=*/2> {
6341   static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
6342   static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
6343     return vmin_s8(a, b);
6344   }
6345   static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
6346     return vmax_s8(a, b);
6347   }
6348 
6349   static inline void KernelMacroBlockIntrinsics(
6350       const int8* scratch_block_data, const int8* filter_workspace,
6351       const int32* bias_data, int8* output_block_data,
6352       const DepthwiseConvDotProdParams* function_params) {
6353     static constexpr QuantizationType quantization_type =
6354         QuantizationType::kPerChannelInt8;
6355 
6356     const int workspace_height_stride =
6357         function_params->workspace_height_stride;
6358     const int input_width_overall_micro_repeats =
6359         function_params->input_width_overall_micro_repeats;
6360     const int output_width_micro_repeats =
6361         function_params->output_width_micro_repeats;
6362     const int depth_micro_repeats = function_params->depth_micro_repeats;
6363     const int depth = function_params->input_depth;
6364     constexpr int kStrideVal = 2;
6365     constexpr int kFourOverStride = 2;
6366     TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
6367     TFLITE_DCHECK_EQ(function_params->four_over_stride, kFourOverStride);
6368 
6369     const int workspace_width_micro_repeats =
6370         function_params->workspace_width_micro_repeats;
6371     const int output_width_overall_micro_repeats =
6372         function_params->output_width_overall_micro_repeats;
6373     const int block_height = function_params->outbound_block_height;
6374     const int residual_width = function_params->output_residual_width;
6375     const int output_height_stride = function_params->output_height_stride;
6376     constexpr int kBiasIncrement = 4;
6377 
6378     TFLITE_DCHECK(depth_micro_repeats > 0);
6379     const int width_micro_stride = 4 * 8;
6380     const int depth_micro_stride =
6381         width_micro_stride * input_width_overall_micro_repeats;
6382 
6383     const int32 output_activation_min =
6384         function_params->quantized_activation_min;
6385     const int32 output_activation_max =
6386         function_params->quantized_activation_max;
6387     const int32 output_offset = function_params->output_offset;
6388     const int32* output_shift_per_channel =
6389         function_params->output_shift_per_channel;
6390     const int32* output_multiplier_per_channel =
6391         function_params->output_multiplier_per_channel;
6392     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
6393       TFLITE_DCHECK_GE(output_activation_min, 0);
6394       TFLITE_DCHECK_LT(output_activation_min, 256);
6395       TFLITE_DCHECK_GE(output_activation_max, 0);
6396       TFLITE_DCHECK_LT(output_activation_max, 256);
6397     } else {
6398       TFLITE_DCHECK_GE(output_activation_min, -128);
6399       TFLITE_DCHECK_LT(output_activation_min, 128);
6400       TFLITE_DCHECK_GE(output_activation_max, -128);
6401       TFLITE_DCHECK_LT(output_activation_max, 128);
6402       TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
6403       TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
6404     }
    TFLITE_DCHECK_GE(output_offset, -32768);
6406     TFLITE_DCHECK_LT(output_offset, 32768);
6407 
6408     // This version only does min/max on 64 bits.
6409     const int16x8_t output_offset_vec =
6410         vdupq_n_s16(static_cast<int16>(output_offset));
6411     const int8x8_t output_activation_min_vec =
6412         vdup_n_s8(static_cast<int8>(output_activation_min));
6413     const int8x8_t output_activation_max_vec =
6414         vdup_n_s8(static_cast<int8>(output_activation_max));
6415 
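    // Each depth micro block consumes 2 sub-blocks x 3 filter rows x 16 bytes
    // of shuffled filter data.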
6416     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
6417 
6418     TFLITE_DCHECK_LE(block_height, 2);
6419 
6420     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
6421       const int8* filter_block =
6422           filter_workspace + shuffled_filter_increment * j_depth;
6423 
6424       if (block_height == 2) {
6425         for (int s = 0; s < 2; ++s) {
6426           // Simulate NEON-register transposition of subset of filter.
6427           int8x16_t filter_reg_0_a;
6428           int8x16_t filter_reg_1_a;
6429           int8x16_t filter_reg_2_a;
6430 
6431           filter_reg_0_a = vld1q_s8(filter_block + s * 16);
6432           filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
6433           filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
6434 
6435           const int8* scratch_data =
6436               scratch_block_data + depth_micro_stride * j_depth;
6437           typename QuantizationTypeImpl<quantization_type>::ExternalType*
6438               output_data = output_block_data + 8 * j_depth;
6439           const int8* input_data_0 = scratch_data + s * 2 * 8;
6440 
6441           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
6442 
6443           const int32x4_t output_shift =
6444               vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
6445           const int32x4_t output_multiplier =
6446               vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
6447 
6448           // Load first sub-micro block of data into operational banks.
6449           int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
6450           int8x16_t left_bank_1_reg =
6451               vld1q_s8(input_data_0 + workspace_height_stride);
6452           int8x16_t left_bank_2_reg =
6453               vld1q_s8(input_data_0 + 2 * workspace_height_stride);
6454           int8x16_t left_bank_3_reg =
6455               vld1q_s8(input_data_0 + 3 * workspace_height_stride);
6456           int8x16_t left_bank_4_reg =
6457               vld1q_s8(input_data_0 + 4 * workspace_height_stride);
6458 
6459           int8x16_t right_bank_0_reg;
6460           int8x16_t right_bank_1_reg;
6461           int8x16_t right_bank_2_reg;
6462           int8x16_t right_bank_3_reg;
6463           int8x16_t right_bank_4_reg;
6464 
6465           int32x4_t acc0;
6466           int32x4_t acc1;
6467           int16x8_t acc_s16_0_1;
6468           int8x8_t acc_u8;
6469 
6470           int i_width = 0;
6471 
          // When output_width_micro_repeats <
          // output_width_overall_micro_repeats, the residual width satisfies
          // 0 < residual_width <= 2, so residual_width == 1 exactly when
          // residual_width < 2.
6475           const int adjusted_width_micro_repeats =
6476               (output_width_micro_repeats <
6477                output_width_overall_micro_repeats) &&
6478                       (residual_width == 1)
6479                   ? output_width_micro_repeats
6480                   : output_width_overall_micro_repeats;
6481 
6482           for (; i_width < adjusted_width_micro_repeats; ++i_width) {
6483             const int output_width = kFourOverStride;
6484             TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
6485             const int8* input_data =
6486                 input_data_0 + width_micro_stride * i_width;
6487             acc0 = adjusted_bias_data;
6488             acc1 = adjusted_bias_data;
6489             right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
6490             right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
6491                                         workspace_height_stride);
6492 
6493             acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6494             acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6495             typename QuantizationTypeImpl<quantization_type>::ExternalType*
6496                 output_data_base = output_data + depth * 2 * i_width + 4 * s;
6497 
6498             right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
6499                                         2 * workspace_height_stride);
6500             right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
6501                                         3 * workspace_height_stride);
6502             acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6503             acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6504             acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6505             acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6506             right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
6507                                         4 * workspace_height_stride);
6508 
6509             // Fixed-point multiplication.
6510             acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6511             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6512                 acc0, output_shift);
6513             acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6514             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6515                 acc1, output_shift);
6516             // Add the output offset.
6517             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6518             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6519             // Apply the activation function.
6520             acc_u8 = vqmovxn_s16(acc_s16_0_1);
6521             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6522             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6523 
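            // Advance the window by two input pixels (one stride-2 output
            // step), merging in data from the right-hand banks.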
6524             left_bank_0_reg = vreinterpretq_s8_u16(
6525                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
6526             left_bank_1_reg = vreinterpretq_s8_u16(
6527                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
6528             left_bank_2_reg = vreinterpretq_s8_u16(
6529                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
6530             left_bank_3_reg = vreinterpretq_s8_u16(
6531                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
6532             left_bank_4_reg = vreinterpretq_s8_u16(
6533                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
6534             acc0 = adjusted_bias_data;
6535             acc1 = adjusted_bias_data;
6536             vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
6537             vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
6538             vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
6539             vst1_lane_s8x4(output_data_base, acc_u8, 0);
6540             vst1_lane_s8x4(output_data_base + output_height_stride, acc_u8, 1);
6541 
6542             vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
6543             vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
6544 
6545             acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6546             acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6547             acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6548             acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6549             acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6550             acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6551 
6552             // Fixed-point multiplication.
6553             acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6554             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6555                 acc0, output_shift);
6556             acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6557             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6558                 acc1, output_shift);
6559             // Add the output offset.
6560             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6561             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6562             // Apply the activation function.
6563             acc_u8 = vqmovxn_s16(acc_s16_0_1);
6564             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6565             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6566 
6567             vst1_lane_s8x4(output_data_base + depth, acc_u8, 0);
6568             vst1_lane_s8x4(output_data_base + depth + output_height_stride,
6569                            acc_u8, 1);
6570 
6571             left_bank_0_reg = right_bank_0_reg;
6572             left_bank_1_reg = right_bank_1_reg;
6573             left_bank_2_reg = right_bank_2_reg;
6574             left_bank_3_reg = right_bank_3_reg;
6575             left_bank_4_reg = right_bank_4_reg;
6576           }
6577           for (; i_width < output_width_overall_micro_repeats; ++i_width) {
6578             TFLITE_DCHECK_NE(residual_width, kFourOverStride);
6579 
6580             // No need to load next ("right") block of data.
6581 
6582             typename QuantizationTypeImpl<quantization_type>::ExternalType*
6583                 output_data_base = output_data + depth * 2 * i_width + 4 * s;
6584 
6585             // Iterate over input width shifts within 4x4 blocks.
6586             {
6587               acc0 = adjusted_bias_data;
6588               acc1 = adjusted_bias_data;
6589 
6590               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6591               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6592               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6593               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6594               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6595               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6596 
6597               // Fixed-point multiplication.
6598               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6599               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6600                   acc0, output_shift);
6601               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6602               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6603                   acc1, output_shift);
6604               // Add the output offset.
6605               int16x8_t acc_s16_0_1 =
6606                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6607               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6608               // Apply the activation function.
6609               int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6610               acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6611               acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6612 
6613               vst1_lane_s8x4(output_data_base, acc_u8, 0);
6614               vst1_lane_s8x4(output_data_base + output_height_stride, acc_u8,
6615                              1);
6616 
6617               left_bank_0_reg = vreinterpretq_s8_u16(
6618                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
6619               left_bank_1_reg = vreinterpretq_s8_u16(
6620                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
6621               left_bank_2_reg = vreinterpretq_s8_u16(
6622                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
6623               left_bank_3_reg = vreinterpretq_s8_u16(
6624                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
6625               left_bank_4_reg = vreinterpretq_s8_u16(
6626                   vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
6627               vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
6628               vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
6629               vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
6630               vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
6631               vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
6632             }
6633           }
6634           bias_data += kBiasIncrement;
6635         }
6636       } else {
6637         // block_height == 1.
6638         int8x16_t filter_reg_0_a;
6639         int8x16_t filter_reg_1_a;
6640         int8x16_t filter_reg_2_a;
6641         int8x16_t filter_reg_0_b;
6642         int8x16_t filter_reg_1_b;
6643         int8x16_t filter_reg_2_b;
6644 
6645         filter_reg_0_a = vld1q_s8(filter_block);
6646         filter_reg_1_a = vld1q_s8(filter_block + 32);
6647         filter_reg_2_a = vld1q_s8(filter_block + 64);
6648         filter_reg_0_b = vld1q_s8(filter_block + 16);
6649         filter_reg_1_b = vld1q_s8(filter_block + 16 + 32);
6650         filter_reg_2_b = vld1q_s8(filter_block + 16 + 64);
6651 
6652         const int8* scratch_data =
6653             scratch_block_data + depth_micro_stride * j_depth;
6654         typename QuantizationTypeImpl<quantization_type>::ExternalType*
6655             output_data = output_block_data + 8 * j_depth;
6656         const int8* input_data_0 = scratch_data;
6657 
6658         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
6659         bias_data += kBiasIncrement;
6660         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
6661         bias_data += kBiasIncrement;
6662 
6663         const int32x4_t output_shift_a =
6664             vld1q_s32(output_shift_per_channel + j_depth * 8);
6665         const int32x4_t output_multiplier_a =
6666             vld1q_s32(output_multiplier_per_channel + j_depth * 8);
6667         const int32x4_t output_shift_b =
6668             vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
6669         const int32x4_t output_multiplier_b =
6670             vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
6671 
6672         // Load first sub-micro block of data into operational banks.
6673         int8x16_t left_bank_0_reg_a = vld1q_s8(input_data_0);
6674         int8x16_t left_bank_1_reg_a =
6675             vld1q_s8(input_data_0 + workspace_height_stride);
6676         int8x16_t left_bank_2_reg_a =
6677             vld1q_s8(input_data_0 + 2 * workspace_height_stride);
6678         int8x16_t left_bank_0_reg_b = vld1q_s8(input_data_0 + 16);
6679         int8x16_t left_bank_1_reg_b =
6680             vld1q_s8(input_data_0 + workspace_height_stride + 16);
6681         int8x16_t left_bank_2_reg_b =
6682             vld1q_s8(input_data_0 + 2 * workspace_height_stride + 16);
6683 
6684         int8x16_t right_bank_0_reg_a;
6685         int8x16_t right_bank_1_reg_a;
6686         int8x16_t right_bank_2_reg_a;
6687         int8x16_t right_bank_0_reg_b;
6688         int8x16_t right_bank_1_reg_b;
6689         int8x16_t right_bank_2_reg_b;
6690 
6691         int32x4_t acc0_a;
6692         int32x4_t acc0_b;
6693 
6694         for (int i_width = 0; i_width < output_width_overall_micro_repeats;
6695              ++i_width) {
6696           const int output_width = i_width == output_width_micro_repeats
6697                                        ? residual_width
6698                                        : kFourOverStride;
6699           TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
6700           const int8* input_data = input_data_0 + width_micro_stride * i_width;
6701           const bool no_right_block = i_width == output_width_micro_repeats &&
6702                                       output_width_overall_micro_repeats ==
6703                                           workspace_width_micro_repeats;
6704 
6705           if (!no_right_block) {
6706             // Load next sub-micro block of data.
6707             right_bank_0_reg_a = vld1q_s8(input_data + width_micro_stride);
6708             right_bank_1_reg_a = vld1q_s8(input_data + width_micro_stride +
6709                                           workspace_height_stride);
6710             right_bank_2_reg_a = vld1q_s8(input_data + width_micro_stride +
6711                                           2 * workspace_height_stride);
6712             right_bank_0_reg_b = vld1q_s8(input_data + width_micro_stride + 16);
6713             right_bank_1_reg_b = vld1q_s8(input_data + width_micro_stride +
6714                                           workspace_height_stride + 16);
6715             right_bank_2_reg_b = vld1q_s8(input_data + width_micro_stride +
6716                                           2 * workspace_height_stride + 16);
6717           }
6718 
6719           typename QuantizationTypeImpl<quantization_type>::ExternalType*
6720               output_data_base = output_data + depth * 2 * i_width;
6721 
6722           // Iterate over input width shifts within 4x4 blocks.
6723           {
6724             acc0_a = adjusted_bias_data_a;
6725             acc0_b = adjusted_bias_data_b;
6726 
6727             acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
6728             acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
6729             acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
6730             acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
6731             acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
6732             acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
6733 
6734             // Fixed-point multiplication.
6735             acc0_a = vqrdmulhq_s32(acc0_a, output_multiplier_a);
6736             acc0_b = vqrdmulhq_s32(acc0_b, output_multiplier_b);
6737             acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6738                 acc0_a, output_shift_a);
6739             acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6740                 acc0_b, output_shift_b);
6741             // Add the output offset.
6742             int16x8_t acc_s16_0_1 =
6743                 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
6744             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6745             // Apply the activation function.
6746             int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6747             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6748             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6749 
6750             vst1_s8(output_data_base, acc_u8);
6751 
6752             left_bank_0_reg_a = vreinterpretq_s8_u16(
6753                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg_a)));
6754             left_bank_1_reg_a = vreinterpretq_s8_u16(
6755                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg_a)));
6756             left_bank_2_reg_a = vreinterpretq_s8_u16(
6757                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg_a)));
6758             left_bank_0_reg_b = vreinterpretq_s8_u16(
6759                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg_b)));
6760             left_bank_1_reg_b = vreinterpretq_s8_u16(
6761                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg_b)));
6762             left_bank_2_reg_b = vreinterpretq_s8_u16(
6763                 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg_b)));
6764             vtrn1_s8x2_in_place(&left_bank_0_reg_a, &right_bank_0_reg_a);
6765             vtrn1_s8x2_in_place(&left_bank_1_reg_a, &right_bank_1_reg_a);
6766             vtrn1_s8x2_in_place(&left_bank_2_reg_a, &right_bank_2_reg_a);
6767             vtrn1_s8x2_in_place(&left_bank_0_reg_b, &right_bank_0_reg_b);
6768             vtrn1_s8x2_in_place(&left_bank_1_reg_b, &right_bank_1_reg_b);
6769             vtrn1_s8x2_in_place(&left_bank_2_reg_b, &right_bank_2_reg_b);
6770           }
6771 
6772           if (output_width > 1) {
6773             acc0_a = adjusted_bias_data_a;
6774             acc0_b = adjusted_bias_data_b;
6775 
6776             acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
6777             acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
6778             acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
6779             acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
6780             acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
6781             acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
6782 
6783             // Fixed-point multiplication.
6784             acc0_a = vqrdmulhq_s32(acc0_a, output_multiplier_a);
6785             acc0_b = vqrdmulhq_s32(acc0_b, output_multiplier_b);
6786             acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6787                 acc0_a, output_shift_a);
6788             acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6789                 acc0_b, output_shift_b);
6790             // Add the output offset.
6791             int16x8_t acc_s16_0_1 =
6792                 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
6793             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6794             // Apply the activation function.
6795             int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6796             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6797             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6798 
6799             vst1_s8(output_data_base + depth, acc_u8);
6800 
6801             left_bank_0_reg_a = right_bank_0_reg_a;
6802             left_bank_1_reg_a = right_bank_1_reg_a;
6803             left_bank_2_reg_a = right_bank_2_reg_a;
6804             left_bank_0_reg_b = right_bank_0_reg_b;
6805             left_bank_1_reg_b = right_bank_1_reg_b;
6806             left_bank_2_reg_b = right_bank_2_reg_b;
6807           }
6808         }
6809       }
6810     }
6811   }  // NOLINT(readability/fn_size) Manually unrolled.
6812 
6813   static inline void Run(const int8* scratch_block_data,
6814                          const int8* filter_workspace, const int32* bias_data,
6815                          int8* output_block_data,
6816                          const DepthwiseConvDotProdParams* function_params) {
6817     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
6818                                output_block_data, function_params);
6819   }
6820 };
6821 
6822 template <>
6823 struct KernelMacroBlock<
6824     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
6825     QuantizationType::kPerChannelInt8,
6826     DepthwiseConvDepthMultiplication::kUnitInputDepth,
6827     /*stride=*/1> {
6828   static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
6829   static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
6830     return vmin_s8(a, b);
6831   }
6832   static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
6833     return vmax_s8(a, b);
6834   }
6835   static inline int8x16_t util_vminq_x8(int8x16_t a, int8x16_t b) {
6836     return vminq_s8(a, b);
6837   }
6838   static inline int8x16_t util_vmaxq_x8(int8x16_t a, int8x16_t b) {
6839     return vmaxq_s8(a, b);
6840   }
6841 
6842   static inline void KernelMacroBlockIntrinsics(
6843       const int8* scratch_block_data, const int8* filter_workspace,
6844       const int32* bias_data, int8* output_block_data,
6845       const DepthwiseConvDotProdParams* function_params) {
6846     static constexpr QuantizationType quantization_type =
6847         QuantizationType::kPerChannelInt8;
6848 
6849     TFLITE_DCHECK_EQ(function_params->stride, 1);
6850     const int workspace_height_stride =
6851         function_params->workspace_height_stride;
6852     const int output_width_micro_repeats =
6853         function_params->output_width_micro_repeats;
6854     const int depth_micro_repeats = function_params->depth_micro_repeats;
6855     const int output_depth = function_params->output_depth;
6856 
6857     const int output_width_overall_micro_repeats =
6858         function_params->output_width_overall_micro_repeats;
6859     const int block_height = function_params->outbound_block_height;
6860     const int residual_width = function_params->output_residual_width;
6861     const int output_height_stride = function_params->output_height_stride;
6862     constexpr int kBiasIncrement = 4;
6863 
6864     TFLITE_DCHECK(depth_micro_repeats > 0);
6865 
6866     const int32 output_activation_min =
6867         function_params->quantized_activation_min;
6868     const int32 output_activation_max =
6869         function_params->quantized_activation_max;
6870     const int32 output_offset = function_params->output_offset;
6871     const int32* output_shift_per_channel =
6872         function_params->output_shift_per_channel;
6873     const int32* output_multiplier_per_channel =
6874         function_params->output_multiplier_per_channel;
6875     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
6876       TFLITE_DCHECK_GE(output_activation_min, 0);
6877       TFLITE_DCHECK_LT(output_activation_min, 256);
6878       TFLITE_DCHECK_GE(output_activation_max, 0);
6879       TFLITE_DCHECK_LT(output_activation_max, 256);
6880     } else {
6881       TFLITE_DCHECK_GE(output_activation_min, -128);
6882       TFLITE_DCHECK_LT(output_activation_min, 128);
6883       TFLITE_DCHECK_GE(output_activation_max, -128);
6884       TFLITE_DCHECK_LT(output_activation_max, 128);
6885       TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
6886       TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
6887     }
6888     TFLITE_DCHECK_GE(output_offset, -32768);
6889     TFLITE_DCHECK_LT(output_offset, 32768);
6890 
6891     const int16x8_t output_offset_vec =
6892         vdupq_n_s16(static_cast<int16>(output_offset));
6893     const int8x16_t output_activation_min_vec =
6894         vdupq_n_s8(static_cast<int8>(output_activation_min));
6895     const int8x16_t output_activation_max_vec =
6896         vdupq_n_s8(static_cast<int8>(output_activation_max));
6897 
6898     typename QuantizationTypeImpl<quantization_type>::ExternalType*
6899         output_data_depthwise = output_block_data;
6900     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
6901       // Simulate NEON-register transposition of subset of filter.
6902       int8x16_t filter_reg_0_a;
6903       int8x16_t filter_reg_0_b;
6904       int8x16_t filter_reg_1_a;
6905       int8x16_t filter_reg_1_b;
6906       int8x16_t filter_reg_2_a;
6907       int8x16_t filter_reg_2_b;
6908       int8x16_t filter_reg_0_a_shifted;
6909       int8x16_t filter_reg_1_a_shifted;
6910       int8x16_t filter_reg_2_a_shifted;
6911 
6912       filter_reg_0_a = vld1q_s8(filter_workspace);
6913       filter_workspace += 16;
6914       filter_reg_0_b = vld1q_s8(filter_workspace);
6915       filter_workspace += 16;
6916       filter_reg_1_a = vld1q_s8(filter_workspace);
6917       filter_workspace += 16;
6918       filter_reg_1_b = vld1q_s8(filter_workspace);
6919       filter_workspace += 16;
6920       filter_reg_2_a = vld1q_s8(filter_workspace);
6921       filter_workspace += 16;
6922       filter_reg_2_b = vld1q_s8(filter_workspace);
6923       filter_workspace += 16;
6924 
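      // The '_shifted' filter copies move every tap up one byte within its
      // 32-bit group (the fourth byte of each group is padding), so dotting
      // them against an unshifted input bank produces the output one column to
      // the right of the unshifted filters.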
6925       filter_reg_0_a_shifted = vreinterpretq_s8_u32(
6926           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
6927       filter_reg_1_a_shifted = vreinterpretq_s8_u32(
6928           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
6929       filter_reg_2_a_shifted = vreinterpretq_s8_u32(
6930           vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
6931 
6932       // When output_width_micro_repeats < output_width_overall_micro_repeats,
6933       // the trailing micro block is partial (residual_width < 4) and is
6934       // handled separately after the main loop.
6935       const int adjusted_width_micro_repeats =
6936           (output_width_micro_repeats < output_width_overall_micro_repeats) &&
6937                   (residual_width < 4)
6938               ? output_width_micro_repeats
6939               : output_width_overall_micro_repeats;
6940 
6941       if (block_height == 4) {
6942         for (int s = 0; s < 2; ++s) {
6943           // Work through one slice at a time, row by row.
6944           typename QuantizationTypeImpl<quantization_type>::ExternalType*
6945               output_data_base = output_data_depthwise + 4 * s;
6946 
6947           const int8* next_input_data = scratch_block_data;
6948           typename QuantizationTypeImpl<quantization_type>::ExternalType*
6949               output_data = output_data_base;
6950 
6951           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
6952           bias_data += kBiasIncrement;
6953 
6954           const int32x4_t output_shift =
6955               vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
6956           const int32x4_t output_multiplier =
6957               vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
6958 
6959           int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
6960           int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
6961           int8x16_t input_bank_c_reg;  //  left 4, right 4, left 5, right 5.
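          // Each 32-bit lane holds 4 consecutive input pixels (one byte per
          // pixel at unit input depth); "left"/"right" are the first and
          // second 4-pixel halves of the 8-pixel window for the indicated row.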
6962 
6963           // Load first sub-micro block of data into operational banks.
6964           input_bank_a_reg =
6965               vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
6966                                                 // uninitialized variable.
6967           input_bank_a_reg = vld1q_lane_8x4(
6968               next_input_data + workspace_height_stride, input_bank_a_reg, 2);
6969           input_bank_b_reg = vld1q_dup_s8x4(
6970               next_input_data +
6971               2 * workspace_height_stride);  // Load lane 0, avoiding
6972                                              // uninitialized variable.
6973           input_bank_b_reg =
6974               vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
6975                              input_bank_b_reg, 2);
6976           input_bank_c_reg = vld1q_dup_s8x4(
6977               next_input_data +
6978               4 * workspace_height_stride);  // Load lane 0, avoiding
6979                                              // uninitialized variable.
6980           input_bank_c_reg =
6981               vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
6982                              input_bank_c_reg, 2);
6983 
6984           int32x4_t acc0;
6985           int32x4_t acc1;
6986           int32x4_t acc2;
6987           int32x4_t acc3;
6988 
6989           acc0 = adjusted_bias_data;
6990           acc1 = adjusted_bias_data;
6991           acc2 = adjusted_bias_data;
6992           acc3 = adjusted_bias_data;
6993 
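          // Software pipelining: part of each column's accumulation is issued
          // ahead of the rest (here for the first column; at the bottom of the
          // loop body for the next one).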
6994           acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
6995           acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
6996           acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
6997           acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
6998 
6999           int i_width = 0;
7000           for (; i_width < adjusted_width_micro_repeats; ++i_width) {
7001             next_input_data += 4;
7002 
7003             // Iterate over input width shifts within 4x4 blocks.
7004             {
7005               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
7006                                          0);
7007               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
7008                                          2);
7009               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
7010                                          2);
7011               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
7012                                          2);
7013               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
7014                                          2);
7015               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
7016                                          0);
7017               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
7018                                          0);
7019               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
7020                                          2);
7021 
7022               // Fixed-point multiplication.
7023               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7024               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7025                   acc0, output_shift);
7026               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7027               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7028                   acc1, output_shift);
7029               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7030               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7031                   acc2, output_shift);
7032               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7033               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7034                   acc3, output_shift);
7035               // Add the output offset.
7036               int16x8_t acc_s16_0_1 =
7037                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7038               int16x8_t acc_s16_2_3 =
7039                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7040               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7041               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7042               // Apply the activation function.
7043               int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
7044                                                  vqmovxn_s16(acc_s16_2_3));
7045               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7046               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7047 
7048               vst1q_lane_s8x4(output_data, acc_u8_all, 0);
7049               vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
7050                               1);
7051               vst1q_lane_s8x4(output_data + 2 * output_height_stride,
7052                               acc_u8_all, 2);
7053               vst1q_lane_s8x4(output_data + 3 * output_height_stride,
7054                               acc_u8_all, 3);
7055 
7056               output_data += output_depth;
7057             }
7058             // Load next sub-micro block of data.
7059             input_bank_a_reg =
7060                 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
7061             input_bank_a_reg = vld1q_lane_8x4(
7062                 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
7063             input_bank_b_reg =
7064                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
7065                                input_bank_b_reg, 1);
7066             input_bank_b_reg =
7067                 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
7068                                input_bank_b_reg, 3);
7069             input_bank_c_reg =
7070                 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
7071                                input_bank_c_reg, 1);
7072             input_bank_c_reg =
7073                 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
7074                                input_bank_c_reg, 3);
7075 
7076             {
7077               acc0 = adjusted_bias_data;
7078               acc1 = adjusted_bias_data;
7079               acc2 = adjusted_bias_data;
7080               acc3 = adjusted_bias_data;
7081 
7082               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
7083                                          input_bank_a_reg, 0);
7084               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
7085                                          input_bank_a_reg, 2);
7086               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
7087                                          input_bank_b_reg, 0);
7088               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
7089                                          input_bank_a_reg, 2);
7090               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
7091                                          input_bank_b_reg, 0);
7092               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
7093                                          input_bank_b_reg, 2);
7094               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
7095                                          input_bank_b_reg, 0);
7096               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
7097                                          input_bank_b_reg, 2);
7098               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
7099                                          input_bank_c_reg, 0);
7100               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
7101                                          input_bank_b_reg, 2);
7102               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
7103                                          input_bank_c_reg, 0);
7104               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
7105                                          input_bank_c_reg, 2);
7106 
7107               // Fixed-point multiplication.
7108               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7109               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7110                   acc0, output_shift);
7111               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7112               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7113                   acc1, output_shift);
7114               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7115               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7116                   acc2, output_shift);
7117               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7118               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7119                   acc3, output_shift);
7120               // Add the output offset.
7121               int16x8_t acc_s16_0_1 =
7122                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7123               int16x8_t acc_s16_2_3 =
7124                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7125               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7126               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7127               // Apply the activation function.
7128               int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
7129                                                  vqmovxn_s16(acc_s16_2_3));
7130               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7131               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7132 
7133               vst1q_lane_s8x4(output_data, acc_u8_all, 0);
7134               vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
7135                               1);
7136               vst1q_lane_s8x4(output_data + 2 * output_height_stride,
7137                               acc_u8_all, 2);
7138               vst1q_lane_s8x4(output_data + 3 * output_height_stride,
7139                               acc_u8_all, 3);
7140 
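              // Two output columns have now been produced from these bytes
              // (via the unshifted and byte-shifted filters), so drop the two
              // leading bytes from each 64-bit half of the input banks.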
7141               input_bank_a_reg = vreinterpretq_s8_u64(
7142                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
7143               input_bank_b_reg = vreinterpretq_s8_u64(
7144                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
7145               input_bank_c_reg = vreinterpretq_s8_u64(
7146                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
7147 
7148               output_data += output_depth;
7149             }
7150 
7151             {
7152               acc0 = adjusted_bias_data;
7153               acc1 = adjusted_bias_data;
7154               acc2 = adjusted_bias_data;
7155               acc3 = adjusted_bias_data;
7156 
7157               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
7158                                          0);
7159               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
7160                                          2);
7161               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7162                                          0);
7163               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
7164                                          2);
7165               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7166                                          0);
7167               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
7168                                          2);
7169               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7170                                          0);
7171               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
7172                                          2);
7173               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
7174                                          0);
7175               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7176                                          2);
7177               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
7178                                          0);
7179               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
7180                                          2);
7181 
7182               // Fixed-point multiplication.
7183               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7184               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7185                   acc0, output_shift);
7186               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7187               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7188                   acc1, output_shift);
7189               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7190               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7191                   acc2, output_shift);
7192               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7193               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7194                   acc3, output_shift);
7195               // Add the output offset.
7196               int16x8_t acc_s16_0_1 =
7197                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7198               int16x8_t acc_s16_2_3 =
7199                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7200               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7201               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7202               // Apply the activation function.
7203               int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
7204                                                  vqmovxn_s16(acc_s16_2_3));
7205               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7206               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7207 
7208               vst1q_lane_s8x4(output_data, acc_u8_all, 0);
7209               vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
7210                               1);
7211               vst1q_lane_s8x4(output_data + 2 * output_height_stride,
7212                               acc_u8_all, 2);
7213               vst1q_lane_s8x4(output_data + 3 * output_height_stride,
7214                               acc_u8_all, 3);
7215 
7216               output_data += output_depth;
7217             }
7218 
7219             {
7220               acc0 = adjusted_bias_data;
7221               acc1 = adjusted_bias_data;
7222               acc2 = adjusted_bias_data;
7223               acc3 = adjusted_bias_data;
7224 
7225               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
7226                                          input_bank_a_reg, 0);
7227               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
7228                                          input_bank_a_reg, 2);
7229               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
7230                                          input_bank_b_reg, 0);
7231               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
7232                                          input_bank_a_reg, 2);
7233               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
7234                                          input_bank_b_reg, 0);
7235               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
7236                                          input_bank_b_reg, 2);
7237               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
7238                                          input_bank_b_reg, 0);
7239               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
7240                                          input_bank_b_reg, 2);
7241               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
7242                                          input_bank_c_reg, 0);
7243               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
7244                                          input_bank_b_reg, 2);
7245               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
7246                                          input_bank_c_reg, 0);
7247               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
7248                                          input_bank_c_reg, 2);
7249 
7250               // Fixed-point multiplication.
7251               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7252               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7253                   acc0, output_shift);
7254               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7255               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7256                   acc1, output_shift);
7257               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7258               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7259                   acc2, output_shift);
7260               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7261               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7262                   acc3, output_shift);
7263               // Add the output offset.
7264               int16x8_t acc_s16_0_1 =
7265                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7266               int16x8_t acc_s16_2_3 =
7267                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7268               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7269               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7270               // Apply the activation function.
7271               int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
7272                                                  vqmovxn_s16(acc_s16_2_3));
7273               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7274               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7275 
7276               vst1q_lane_s8x4(output_data, acc_u8_all, 0);
7277               vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
7278                               1);
7279               vst1q_lane_s8x4(output_data + 2 * output_height_stride,
7280                               acc_u8_all, 2);
7281               vst1q_lane_s8x4(output_data + 3 * output_height_stride,
7282                               acc_u8_all, 3);
7283 
7284               input_bank_a_reg = vreinterpretq_s8_u64(
7285                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
7286               input_bank_b_reg = vreinterpretq_s8_u64(
7287                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
7288               input_bank_c_reg = vreinterpretq_s8_u64(
7289                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
7290 
7291               output_data += output_depth;
7292               acc0 = adjusted_bias_data;
7293               acc1 = adjusted_bias_data;
7294               acc2 = adjusted_bias_data;
7295               acc3 = adjusted_bias_data;
7296 
7297               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7298                                          0);
7299               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7300                                          0);
7301               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7302                                          0);
7303               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7304                                          2);
7305             }
7306           }
7307 
7308           if (i_width < output_width_overall_micro_repeats) {
7309             next_input_data += 4;
7310             const int output_width = residual_width;
7311 
7312             // Load next sub-micro block of data.
7313             input_bank_a_reg =
7314                 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
7315             input_bank_a_reg = vld1q_lane_8x4(
7316                 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
7317             input_bank_b_reg =
7318                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
7319                                input_bank_b_reg, 1);
7320             input_bank_b_reg =
7321                 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
7322                                input_bank_b_reg, 3);
7323             input_bank_c_reg =
7324                 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
7325                                input_bank_c_reg, 1);
7326             input_bank_c_reg =
7327                 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
7328                                input_bank_c_reg, 3);
7329 
7330             // Iterate over input width shifts within 4x4 blocks.
7331             for (int x = 0; x < output_width; ++x) {
7332               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
7333                                          0);
7334               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
7335                                          2);
7336               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
7337                                          2);
7338               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
7339                                          2);
7340               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
7341                                          2);
7342               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
7343                                          0);
7344               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
7345                                          0);
7346               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
7347                                          2);
7348 
7349               // Fixed-point multiplication.
7350               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7351               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7352                   acc0, output_shift);
7353               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7354               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7355                   acc1, output_shift);
7356               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7357               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7358                   acc2, output_shift);
7359               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7360               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7361                   acc3, output_shift);
7362               // Add the output offset.
7363               int16x8_t acc_s16_0_1 =
7364                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7365               int16x8_t acc_s16_2_3 =
7366                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7367               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7368               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7369               // Apply the activation function.
7370               int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
7371                                                  vqmovxn_s16(acc_s16_2_3));
7372               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7373               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7374 
7375               vst1q_lane_s8x4(output_data, acc_u8_all, 0);
7376               vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
7377                               1);
7378               vst1q_lane_s8x4(output_data + 2 * output_height_stride,
7379                               acc_u8_all, 2);
7380               vst1q_lane_s8x4(output_data + 3 * output_height_stride,
7381                               acc_u8_all, 3);
7382 
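              // The residual path produces one output column per iteration, so
              // advance the input banks by a single byte.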
7383               input_bank_a_reg = vreinterpretq_s8_u64(
7384                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 8));
7385               input_bank_b_reg = vreinterpretq_s8_u64(
7386                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 8));
7387               input_bank_c_reg = vreinterpretq_s8_u64(
7388                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 8));
7389 
7390               output_data += output_depth;
7391 
7392               acc0 = adjusted_bias_data;
7393               acc1 = adjusted_bias_data;
7394               acc2 = adjusted_bias_data;
7395               acc3 = adjusted_bias_data;
7396 
7397               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7398                                          0);
7399               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7400                                          0);
7401               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7402                                          0);
7403               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7404                                          2);
7405             }
7406           }
7407           // scratch_block_data += 4 * workspace_height_stride;
7408           output_data_base += 4 * output_height_stride;
7409 
7410           // Move to the next sub-block: advance to the second set of filters
7411           // and the corresponding bias.
7412           filter_reg_0_a = filter_reg_0_b;
7413           filter_reg_1_a = filter_reg_1_b;
7414           filter_reg_2_a = filter_reg_2_b;
7415           filter_reg_0_a_shifted = vreinterpretq_s8_u32(
7416               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
7417           filter_reg_1_a_shifted = vreinterpretq_s8_u32(
7418               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
7419           filter_reg_2_a_shifted = vreinterpretq_s8_u32(
7420               vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
7421         }
7422       } else {
7423         // Block height < 4.
7424         typename QuantizationTypeImpl<quantization_type>::ExternalType*
7425             output_data_base = output_data_depthwise;
7426 
7427         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
7428         bias_data += kBiasIncrement;
7429         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
7430         bias_data += kBiasIncrement;
7431 
7432         const int32x4_t output_shift_a =
7433             vld1q_s32(output_shift_per_channel + j_depth * 8);
7434         const int32x4_t output_multiplier_a =
7435             vld1q_s32(output_multiplier_per_channel + j_depth * 8);
7436         const int32x4_t output_shift_b =
7437             vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
7438         const int32x4_t output_multiplier_b =
7439             vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
7440 
7441         for (int k_height = 0; k_height < block_height; ++k_height) {
7442           const int8* next_input_data =
7443               scratch_block_data + k_height * workspace_height_stride;
7444           typename QuantizationTypeImpl<quantization_type>::ExternalType*
7445               output_data = output_data_base;
7446 
7447           int8x16_t input_bank_p_reg;  //  left 0, right 0, left 1, right 1.
7448           int8x16_t input_bank_q_reg;  //  left 2, right 2, left 3, right 3.
7449 
7450           // Load first sub-micro block of data into operational banks.
7451           input_bank_p_reg =
7452               vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
7453                                                 // uninitialized variable.
7454           input_bank_p_reg = vld1q_lane_8x4(
7455               next_input_data + workspace_height_stride, input_bank_p_reg, 2);
7456           input_bank_q_reg = vld1q_dup_s8x4(
7457               next_input_data +
7458               2 * workspace_height_stride);  // Load lane 0, avoiding
7459                                              // uninitialized variable.
7460 
7461           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
7462                ++i_width) {
7463             next_input_data += 4;
7464             const int output_width =
7465                 i_width == output_width_micro_repeats ? residual_width : 4;
7466 
7467             // Load next sub-micro block of data.
7468             input_bank_p_reg =
7469                 vld1q_lane_8x4(next_input_data, input_bank_p_reg, 1);
7470             input_bank_p_reg = vld1q_lane_8x4(
7471                 next_input_data + workspace_height_stride, input_bank_p_reg, 3);
7472             input_bank_q_reg =
7473                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
7474                                input_bank_q_reg, 1);
7475             // Iterate over input width shifts within 4x4 blocks.
7476             for (int x = 0; x < output_width; ++x) {
7477               int32x4_t acc_a = adjusted_bias_data_a;
7478               int32x4_t acc_b = adjusted_bias_data_b;
7479               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_0_a,
7480                                           input_bank_p_reg, 0);
7481               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_1_a,
7482                                           input_bank_p_reg, 2);
7483               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_2_a,
7484                                           input_bank_q_reg, 0);
7485               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_0_b,
7486                                           input_bank_p_reg, 0);
7487               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_1_b,
7488                                           input_bank_p_reg, 2);
7489               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_2_b,
7490                                           input_bank_q_reg, 0);
7491 
7492               // Fixed-point multiplication.
7493               acc_a = vqrdmulhq_s32(acc_a, output_multiplier_a);
7494               acc_b = vqrdmulhq_s32(acc_b, output_multiplier_b);
7495               acc_a =
7496                   DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7497                       acc_a, output_shift_a);
7498               acc_b =
7499                   DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7500                       acc_b, output_shift_b);
7501               // Add the output offset.
7502               int16x8_t acc_s16_0_0 =
7503                   vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
7504               acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
7505               // Apply the activation function.
7506               int8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
7507               acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
7508                                         vget_low_s8(output_activation_min_vec));
7509               acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
7510                                         vget_low_s8(output_activation_max_vec));
7511 
7512               vst1_s8(output_data, acc_u8_0_0);
7513 
7514               input_bank_p_reg = vreinterpretq_s8_u64(
7515                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_p_reg), 8));
7516               input_bank_q_reg = vreinterpretq_s8_u64(
7517                   vshrq_n_u64(vreinterpretq_u64_s8(input_bank_q_reg), 8));
7518 
7519               output_data += output_depth;
7520             }
7521           }
7522           output_data_base += output_height_stride;
7523         }
7524       }
7525       output_data_depthwise += 8;
7526     }
7527   }  // NOLINT(readability/fn_size) Manually unrolled.
7528 
7529   static inline void Run(const int8* scratch_block_data,
7530                          const int8* filter_workspace, const int32* bias_data,
7531                          int8* output_block_data,
7532                          const DepthwiseConvDotProdParams* function_params) {
7533     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
7534                                output_block_data, function_params);
7535   }
7536 };
7537 
7538 template <>
7539 struct KernelMacroBlock<
7540     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
7541     QuantizationType::kPerChannelInt8,
7542     DepthwiseConvDepthMultiplication::kUnitInputDepth,
7543     /*stride=*/2> {
7544   static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
7545   static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
7546     return vmin_s8(a, b);
7547   }
7548   static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
7549     return vmax_s8(a, b);
7550   }
7551 
7552   static inline void KernelMacroBlockIntrinsics(
7553       const int8* scratch_block_data, const int8* filter_workspace,
7554       const int32* bias_data, int8* output_block_data,
7555       const DepthwiseConvDotProdParams* function_params) {
7556     static constexpr QuantizationType quantization_type =
7557         QuantizationType::kPerChannelInt8;
7558 
7559     const int workspace_height_stride =
7560         function_params->workspace_height_stride;
7561     const int output_width_micro_repeats =
7562         function_params->output_width_micro_repeats;
7563     const int depth_micro_repeats = function_params->depth_micro_repeats;
7564     const int output_depth = function_params->output_depth;
7565     constexpr int kStrideVal = 2;
7566     TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
7567 
7568     const int output_width_overall_micro_repeats =
7569         function_params->output_width_overall_micro_repeats;
7570     const int block_height = function_params->outbound_block_height;
7571     const int residual_width = function_params->output_residual_width;
7572     const int output_height_stride = function_params->output_height_stride;
7573     constexpr int kBiasIncrement = 4;
7574 
7575     const int32 output_activation_min =
7576         function_params->quantized_activation_min;
7577     const int32 output_activation_max =
7578         function_params->quantized_activation_max;
7579     const int32 output_offset = function_params->output_offset;
7580     const int32* output_shift_per_channel =
7581         function_params->output_shift_per_channel;
7582     const int32* output_multiplier_per_channel =
7583         function_params->output_multiplier_per_channel;
7584     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
7585       TFLITE_DCHECK_GE(output_activation_min, 0);
7586       TFLITE_DCHECK_LT(output_activation_min, 256);
7587       TFLITE_DCHECK_GE(output_activation_max, 0);
7588       TFLITE_DCHECK_LT(output_activation_max, 256);
7589     } else {
7590       TFLITE_DCHECK_GE(output_activation_min, -128);
7591       TFLITE_DCHECK_LT(output_activation_min, 128);
7592       TFLITE_DCHECK_GE(output_activation_max, -128);
7593       TFLITE_DCHECK_LT(output_activation_max, 128);
7594       TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
7595       TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
7596     }
7597     TFLITE_DCHECK_GE(output_offset, -32768);
7598     TFLITE_DCHECK_LT(output_offset, 32768);
7599 
7600     TFLITE_DCHECK_GE(depth_micro_repeats, 1);
7601 
7602     const int16x8_t output_offset_vec =
7603         vdupq_n_s16(static_cast<int16>(output_offset));
7604     const int8x16_t output_activation_min_vec =
7605         vdupq_n_s8(static_cast<int8>(output_activation_min));
7606     const int8x16_t output_activation_max_vec =
7607         vdupq_n_s8(static_cast<int8>(output_activation_max));
7608 
7609     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
7610       int8x16_t filter_reg_0_a;
7611       int8x16_t filter_reg_0_b;
7612       int8x16_t filter_reg_1_a;
7613       int8x16_t filter_reg_1_b;
7614       int8x16_t filter_reg_2_a;
7615       int8x16_t filter_reg_2_b;
7616 
7617       filter_reg_0_a = vld1q_s8(filter_workspace);
7618       filter_workspace += 16;
7619       filter_reg_0_b = vld1q_s8(filter_workspace);
7620       filter_workspace += 16;
7621       filter_reg_1_a = vld1q_s8(filter_workspace);
7622       filter_workspace += 16;
7623       filter_reg_1_b = vld1q_s8(filter_workspace);
7624       filter_workspace += 16;
7625       filter_reg_2_a = vld1q_s8(filter_workspace);
7626       filter_workspace += 16;
7627       filter_reg_2_b = vld1q_s8(filter_workspace);
7628       filter_workspace += 16;
7629 
7630       const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
7631       bias_data += kBiasIncrement;
7632       const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
7633       bias_data += kBiasIncrement;
7634 
7635       const int32x4_t output_shift_s_0 =
7636           vld1q_s32(output_shift_per_channel + j_depth * 8);
7637       const int32x4_t output_multiplier_s_0 =
7638           vld1q_s32(output_multiplier_per_channel + j_depth * 8);
7639       const int32x4_t output_shift_s_1 =
7640           vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
7641       const int32x4_t output_multiplier_s_1 =
7642           vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
7643 
7644       if (block_height == 2) {
7645         const int8* scratch_data = scratch_block_data;
7646         typename QuantizationTypeImpl<quantization_type>::ExternalType*
7647             output_data = output_block_data + 8 * j_depth;
7648 
7649         int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
7650         int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
7651         int8x16_t input_bank_c_reg;  //  left 4, right 4, xxx, xxx.
7652 
7653         // Load first sub-micro block of data into operational banks.
7654         input_bank_a_reg =
7655             vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
7656                                            // uninitialized variable.
7657         input_bank_a_reg = vld1q_lane_8x4(
7658             scratch_data + workspace_height_stride, input_bank_a_reg, 2);
7659         input_bank_b_reg = vld1q_dup_s8x4(
7660             scratch_data +
7661             2 * workspace_height_stride);  // Load lane 0, avoiding
7662                                            // uninitialized variable.
7663         input_bank_b_reg = vld1q_lane_8x4(
7664             scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
7665         input_bank_c_reg = vld1q_dup_s8x4(
7666             scratch_data +
7667             4 * workspace_height_stride);  // Load lane 0, avoiding
7668                                            // uninitialized variable.
7669 
7670         int32x4_t acc0;
7671         int32x4_t acc1;
7672 
7673         // When output_width_micro_repeats < output_width_overall_micro_repeats,
7674         // 0 < residual_width <= 2, and so residual_width == 1 is then true iff
7675         // residual_width < 2.
7676         const int adjusted_width_micro_repeats =
7677             (output_width_micro_repeats < output_width_overall_micro_repeats) &&
7678                     (residual_width < 2)
7679                 ? output_width_micro_repeats
7680                 : output_width_overall_micro_repeats;

        int i_width = 0;
        for (; i_width < adjusted_width_micro_repeats; ++i_width) {
          const int8* input_data = scratch_data + 4 + 4 * i_width;

          // Load next sub-micro block of data.
          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
          input_bank_a_reg = vld1q_lane_8x4(
              input_data + workspace_height_stride, input_bank_a_reg, 3);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
          input_bank_c_reg = vld1q_lane_8x4(
              input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);

          int16x8_t acc_s16_0_1;
          int8x8_t acc_u8_0_1;
          // Iterate over input width shifts within 4x4 blocks.
          {
            acc0 = adjusted_bias_data_s_0;
            acc1 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);

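            // The requantization below is roughly the standard TFLite
            // per-channel pipeline: saturating-doubling high multiply by the
            // channel multiplier, rounding divide by a power of two (upward
            // rounding here), add the output offset, clamp to the activation
            // range, and narrow to 8 bits.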
            // Fixed-point multiplication.
            acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc0, output_shift_s_0);
            acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc1, output_shift_s_0);
            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
            acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_min_vec));
            acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_max_vec));

            vst1_lane_s8x4(output_data, acc_u8_0_1, 0);
            vst1_lane_s8x4(output_data + output_height_stride, acc_u8_0_1, 1);

            acc0 = adjusted_bias_data_s_1;
            acc1 = adjusted_bias_data_s_1;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);

            // Fixed-point multiplication.
            acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc0, output_shift_s_1);
            acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc1, output_shift_s_1);
            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
            acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_min_vec));
            acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_max_vec));

            vst1_lane_s8x4(output_data + 4, acc_u8_0_1, 0);
            vst1_lane_s8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
                           1);

            input_bank_a_reg = vreinterpretq_s8_u64(
                vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
            input_bank_b_reg = vreinterpretq_s8_u64(
                vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
            input_bank_c_reg = vreinterpretq_s8_u64(
                vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
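            // Each 64-bit half of a bank holds the loaded columns of one
            // input row; shifting it right by 16 bits drops two columns,
            // which advances the window by the horizontal stride of 2 for
            // the next output column.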

            output_data += output_depth;
          }

          // output_width == four_over_stride.
          acc0 = adjusted_bias_data_s_0;
          acc1 = adjusted_bias_data_s_0;

          acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
          acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
          acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);

          // Fixed-point multiplication.
          acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
          acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
              acc0, output_shift_s_0);
          acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
          acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
              acc1, output_shift_s_0);
          // Add the output offset.
          acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
          acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
          // Apply the activation function.
          acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
          acc_u8_0_1 =
              util_vmax_x8(acc_u8_0_1, vget_low_s8(output_activation_min_vec));
          acc_u8_0_1 =
              util_vmin_x8(acc_u8_0_1, vget_low_s8(output_activation_max_vec));

          vst1_lane_s8x4(output_data, acc_u8_0_1, 0);
          vst1_lane_s8x4(output_data + output_height_stride, acc_u8_0_1, 1);

          acc0 = adjusted_bias_data_s_1;
          acc1 = adjusted_bias_data_s_1;

          acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
          acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
          acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);

          // Fixed-point multiplication.
          acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
          acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
              acc0, output_shift_s_1);
          acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
          acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
              acc1, output_shift_s_1);
          // Add the output offset.
          acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
          acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
          // Apply the activation function.
          acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
          acc_u8_0_1 =
              util_vmax_x8(acc_u8_0_1, vget_low_s8(output_activation_min_vec));
          acc_u8_0_1 =
              util_vmin_x8(acc_u8_0_1, vget_low_s8(output_activation_max_vec));

          vst1_lane_s8x4(output_data + 4, acc_u8_0_1, 0);
          vst1_lane_s8x4(output_data + 4 + output_height_stride, acc_u8_0_1, 1);

          input_bank_a_reg = vreinterpretq_s8_u64(
              vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
          input_bank_b_reg = vreinterpretq_s8_u64(
              vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
          input_bank_c_reg = vreinterpretq_s8_u64(
              vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));

          output_data += output_depth;
        }
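        // Trailing micro block: reached only when the residual block yields a
        // single output column (residual_width == 1), as peeled off above.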
        for (; i_width < output_width_overall_micro_repeats; ++i_width) {
          // output_width == 1.
          const int8* input_data = scratch_data + 4 + 4 * i_width;

          // Load next sub-micro block of data.
          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
          input_bank_a_reg = vld1q_lane_8x4(
              input_data + workspace_height_stride, input_bank_a_reg, 3);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
          input_bank_c_reg = vld1q_lane_8x4(
              input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);

          int16x8_t acc_s16_0_1;
          int8x8_t acc_u8_0_1;
          // Iterate over input width shifts within 4x4 blocks.
          {
            acc0 = adjusted_bias_data_s_0;
            acc1 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);

            // Fixed-point multiplication.
            acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc0, output_shift_s_0);
            acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc1, output_shift_s_0);
            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
            acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_min_vec));
            acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_max_vec));

            vst1_lane_s8x4(output_data, acc_u8_0_1, 0);
            vst1_lane_s8x4(output_data + output_height_stride, acc_u8_0_1, 1);

            acc0 = adjusted_bias_data_s_1;
            acc1 = adjusted_bias_data_s_1;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);

            // Fixed-point multiplication.
            acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc0, output_shift_s_1);
            acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc1, output_shift_s_1);
            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
            acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_min_vec));
            acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_max_vec));

            vst1_lane_s8x4(output_data + 4, acc_u8_0_1, 0);
            vst1_lane_s8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
                           1);

            input_bank_a_reg = vreinterpretq_s8_u64(
                vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
            input_bank_b_reg = vreinterpretq_s8_u64(
                vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
            input_bank_c_reg = vreinterpretq_s8_u64(
                vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));

            output_data += output_depth;
          }
        }
      } else {
        TFLITE_DCHECK_EQ(block_height, 1);
        // Work through one slice, by row, at a time.
        const int8* scratch_data = scratch_block_data;
        typename QuantizationTypeImpl<quantization_type>::ExternalType*
            output_data = output_block_data + 8 * j_depth;

        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
        int8x16_t input_bank_b_reg;  //  left 2, right 2, xxx, xxx.
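        // A single output row only needs three input rows of the 3x3 filter
        // window, so two bank registers suffice: bank_a holds rows 0 and 1,
        // bank_b holds row 2.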

        // Load first sub-micro block of data into operational banks.
        input_bank_a_reg =
            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
                                           // uninitialized variable.
        input_bank_a_reg = vld1q_lane_8x4(
            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
        input_bank_b_reg = vld1q_dup_s8x4(
            scratch_data +
            2 * workspace_height_stride);  // Load lane 0, avoiding
                                           // uninitialized variable.

        int32x4_t acc0;
        int32x4_t acc1;

        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
             ++i_width) {
          const int output_width =
              i_width == output_width_micro_repeats ? residual_width : 2;

          TFLITE_DCHECK_LE(output_width, 2);
          TFLITE_DCHECK_GE(output_width, 1);
          TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
          const int8* input_data = scratch_data + 4 + 4 * i_width;

          // Load next sub-micro block of data.
          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
          input_bank_a_reg = vld1q_lane_8x4(
              input_data + workspace_height_stride, input_bank_a_reg, 3);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);

          int16x8_t acc_s16_0_1;
          int8x8_t acc_u8_0_1;

          // Iterate over input width shifts within 4x4 blocks.
          {
            acc0 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);

            acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc0, output_shift_s_0);

            // Second sub-block accumulation.
            acc1 = adjusted_bias_data_s_1;

            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);

            acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc1, output_shift_s_1);

            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
            acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_min_vec));
            acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_max_vec));

            // This stores the results for both sub-blocks together.
            vst1_s8(output_data, acc_u8_0_1);

            input_bank_a_reg = vreinterpretq_s8_u64(
                vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
            input_bank_b_reg = vreinterpretq_s8_u64(
                vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));

            output_data += output_depth;
          }
          if (output_width == 2) {
            acc0 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);

            acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc0, output_shift_s_0);

            // Second sub-block accumulation.
            acc1 = adjusted_bias_data_s_1;

            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);

            acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc1, output_shift_s_1);

            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
            acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_min_vec));
            acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_max_vec));

            // This stores the results for both sub-blocks together.
            vst1_s8(output_data, acc_u8_0_1);

            input_bank_a_reg = vreinterpretq_s8_u64(
                vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
            input_bank_b_reg = vreinterpretq_s8_u64(
                vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));

            output_data += output_depth;
          }
        }
      }
    }
  }

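  // Entry point used when this intrinsics variant is selected; it simply
  // forwards to the implementation above.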
  static inline void Run(const int8* scratch_block_data,
                         const int8* filter_workspace, const int32* bias_data,
                         int8* output_block_data,
                         const DepthwiseConvDotProdParams* function_params) {
    KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
                               output_block_data, function_params);
  }
};

#undef vst1_lane_s8x4
#undef vst1_lane_u8x4
#undef vst1q_lane_s8x4
#undef vst1q_lane_u8x4
#undef vld1q_lane_s8x8
#undef vld1_lane_8x4
#undef vld1q_lane_8x4
#undef vld1q_dup_s8x4

#endif  //  USE_NEON

}  // namespace depthwise_conv
}  // namespace optimized_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_