1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
17
18 // This file provides kernel implementations that are not used in shipped
19 // inference code, but rather (a) show how model C++ code is designed and then
20 // transformed into asm code, and (b) aid with maintenance and later development
21 // of variations. Many projects (even including, say, the classic NAG libraries)
22 // develop highly optimized code, but do not maintain intermediate versions.
23 // Often the result is incomprehensible final-version code.
24
25 #include <algorithm>
26
27 #include "tensorflow/lite/kernels/internal/compatibility.h"
28 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
29 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
30 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
31 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
32 #include "tensorflow/lite/kernels/internal/types.h"
33
34 namespace tflite {
35 namespace optimized_ops {
36 namespace depthwise_conv {
37
38 #ifdef USE_NEON
39
40 inline void util_vst1_u8(uint8* data_addr, uint8x8_t reg) {
41 return vst1_u8(data_addr, reg);
42 }
43 inline void util_vst1_x8(uint8* data_addr, int8x8_t reg) {
44 return vst1_u8(data_addr, vreinterpret_u8_s8(reg));
45 }
46 inline void util_vst1_x8(int8* data_addr, int8x8_t reg) {
47 return vst1_s8(data_addr, reg);
48 }
49
50 // Lane operations are for clarity and convenience. We want to load and store
51 // 4 8-bit lanes together. So these are treated much like 32-bit loads and
52 // 32-bit stores. Stores require 32-bit alignment.
53
54 #define vst1_lane_s8x4(dst, reg, lane_num) \
55 TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
56 vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), vreinterpret_u32_s8(reg), \
57 lane_num)
58 #define vst1_lane_u8x4(dst, reg, lane_num) \
59 TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
60 vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), vreinterpret_u32_u8(reg), \
61 lane_num)
62 #define vst1q_lane_s8x4(dst, reg, lane_num) \
63 TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
64 vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), vreinterpretq_u32_s8(reg), \
65 lane_num)
66 #define vst1q_lane_u8x4(dst, reg, lane_num) \
67 TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
68 vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), vreinterpretq_u32_u8(reg), \
69 lane_num)
70
71 // Important! Most compilation configurations will compile and run without
72 // the reinterpret_cast, but sanitizers may then fail silently on
73 // lane-loading, due to an obscure bug or mis-feature, probably in unhygienic macro expansion.
74 #define vld1q_lane_s8x8(src, reg, lane_num) \
75 vreinterpretq_s8_u64(vld1q_lane_u64(reinterpret_cast<const uint64_t*>(src), \
76 vreinterpretq_u64_s8(reg), lane_num))
77 #define vld1_lane_8x4(src, reg, lane_num) \
78 vreinterpret_s8_s32(vld1_lane_s32(reinterpret_cast<const int32*>(src), \
79 vreinterpret_s32_s8(reg), lane_num))
80 #define vld1q_lane_8x4(src, reg, lane_num) \
81 vreinterpretq_s8_s32(vld1q_lane_s32(reinterpret_cast<const int32*>(src), \
82 vreinterpretq_s32_s8(reg), lane_num))
83 #define vld1q_dup_s8x4(src) \
84 vreinterpretq_s8_s32(vld1q_dup_s32(reinterpret_cast<const int32*>(src)))
85
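// Illustrative sketch of how the 4-byte lane macros above are meant to be
// used. The helper below is a placeholder added for exposition only and is
// not referenced by the kernels in this file: it moves a depth-4 group
// through lane 0 of a D-register.
inline void util_example_copy_s8x4(const int8* src, int8* dst) {
  // Both pointers are assumed 4-byte aligned, as the store macro checks.
  int8x8_t reg = vdup_n_s8(0);
  reg = vld1_lane_8x4(src, reg, 0);  // Load 4 bytes into lane 0.
  vst1_lane_s8x4(dst, reg, 0);       // Store lane 0 back as 4 bytes.
}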
86 #endif // USE_NEON
87
88 template <QuantizationType quantization_type>
89 struct ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
90 quantization_type> {
91 // Filter data is provided as filter_block[3][3][depth/8][2][4]: height 3,
92 // width 3, depth micro-blocks, sub-block 0 or 1, depth 4. Filter data is
93 // written as filter_bank[3][2][4][4]; height 3, sub-block, depth 4, width 4.
94 //
95 // Note that this rearrangement is much like that performed on input data when
96 // filling the workspace, and optimized versions will be similar.
97 static inline void FillFilterBank(int depth, const uint8* filter_block,
98 int8 filter_bank[3][2][4][4]) {
99 constexpr int kSymmetricZeroPoint =
100 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
101 // Load filter data in, 8-bytes down depth / sub-block at a time.
102 //
103 // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
104 // depth 4.
105 uint8 loaded_filter[3][4][2][4];
106 for (int y = 0; y < 3; ++y) {
107 for (int x = 0; x < 3; ++x) {
108 memcpy(loaded_filter[y][x][0], &filter_block[3 * y * depth + x * depth],
109 8);
110 }
111 // Pad the filter with the symmetric representation of 0, so that the values
112 // become 0 when the zero-point is subtracted below. Thus these filter taps
113 // are effectively disregarded in later filtering.
114 memset(loaded_filter[y][3][0], kSymmetricZeroPoint, 8);
115 }
116 for (int y = 0; y < 3; ++y) {
117 for (int z = 0; z < 4; ++z) {
118 for (int x = 0; x < 4; ++x) {
119 filter_bank[y][0][z][x] =
120 loaded_filter[y][x][0][z] - kSymmetricZeroPoint;
121 filter_bank[y][1][z][x] =
122 loaded_filter[y][x][1][z] - kSymmetricZeroPoint;
123 }
124 }
125 }
126 }
127
128 // Adjust the bias data according to the input offset and the filter weights.
129 //
130 // The output calculation is
131 // out[h][w][d] = bias[d] + sum_ij (in[h+i][w+j][d] + in_offset) *
132 // (filter[i][j][d] + filter_offset)
133 // (where offsets are expressed as differences from 128).
134 //
135 // Since we cannot efficiently handle varying offsets / bias across the image,
136 // we insist on filter_offset = 0.
137 //
138 // This function calculates
139 // adjusted_bias[d] = bias[d] + sum_ij in_offset * filter[i][j][d]
140 // which accounts for the input offset. Note that even if the bias is
141 // constant over the depth, the adjusted bias will generally vary.
142 static inline void AdjustBias(int32 input_offset,
143 const int8 filter_bank[3][2][4][4],
144 const int32* bias_data,
145 int32 adjusted_bias_block[2][4]) {
146 constexpr int kSymmetricZeroPoint =
147 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
148 TFLITE_DCHECK_GE(input_offset, -255);
149 TFLITE_DCHECK_LE(input_offset, 0);
150 // For instance, if input_offset == -128, no adjustment is needed.
151 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
152
153 for (int s = 0; s < 2; ++s) {
154 for (int z = 0; z < 4; ++z) {
155 adjusted_bias_block[s][z] = bias_data[4 * s + z];
156 for (int i = 0; i < 9; ++i) {
157 adjusted_bias_block[s][z] +=
158 input_offset_difference * filter_bank[i % 3][s][z][i / 3];
159 }
160 }
161 }
162 }
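// Restating the comment above as a single derivation (with filter_offset == 0,
// as required above):
//
//   out[h][w][d] = bias[d] + sum_ij (in[h+i][w+j][d] + in_offset)
//                                   * filter[i][j][d]
//                = (bias[d] + in_offset * sum_ij filter[i][j][d])
//                  + sum_ij in[h+i][w+j][d] * filter[i][j][d],
//
// so the parenthesized term is exactly adjusted_bias[d], and the per-pixel
// work reduces to a plain dot product against the offset-free filter.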
163
164 static void Run(const uint8* filter_data, const int32* bias_data,
165 int8* shuffled_filter_data, int32* adjusted_bias_data,
166 const DepthwiseConvDotProdParams* function_params) {
167 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
168 const int depth = function_params->output_depth;
169 const int depth_micro_repeats = function_params->depth_micro_repeats;
170 const int bias_increment = function_params->bias_increment;
171 const int32 input_offset = function_params->input_offset;
172
173 int8 filter_bank[3][2][4][4];
174 int32 adjusted_bias_block[2][4];
175
176 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
177 FillFilterBank(depth, filter_data + 8 * j_depth, filter_bank);
178 AdjustBias(input_offset, filter_bank,
179 bias_data + 2 * bias_increment * j_depth, adjusted_bias_block);
180
181 memcpy(shuffled_filter_data, filter_bank[0][0][0],
182 shuffled_filter_increment);
183 shuffled_filter_data += shuffled_filter_increment;
184 memcpy(adjusted_bias_data, adjusted_bias_block[0],
185 8 * sizeof(adjusted_bias_block[0][0]));
186 adjusted_bias_data += 8;
187 }
188 }
189 };
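// A minimal usage sketch of the C-model pre-processing above, with
// hypothetical sizes. Only the DepthwiseConvDotProdParams fields read by
// Run() are shown, the field values are illustrative rather than
// prescriptive, and filter_data / bias_data stand for the raw per-channel
// filter and bias arrays.
//
//   DepthwiseConvDotProdParams params;
//   params.output_depth = 16;        // Two depth micro blocks of 8 channels.
//   params.depth_micro_repeats = 2;
//   params.bias_increment = 4;       // Assumed: 4 int32 values per sub-block.
//   params.input_offset = -128;      // Symmetric uint8 zero point.
//   int8 shuffled_filter[2 * 96];    // 96 bytes per depth micro block.
//   int32 adjusted_bias[2 * 8];      // 8 values per depth micro block.
//   ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
//                   QuantizationType::kNonPerChannelUint8>::
//       Run(filter_data, bias_data, shuffled_filter, adjusted_bias, &params);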
190
191 template <QuantizationType quantization_type>
192 struct ProcessPerDepth<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
193 quantization_type> {
194 static inline void Run(const uint8* filter_data, const int32* bias_data,
195 int8* shuffled_filter_data, int32* adjusted_bias_data,
196 const DepthwiseConvDotProdParams* function_params) {
197 const int depth = function_params->output_depth;
198 const int depth_micro_repeats = function_params->depth_micro_repeats;
199 const int bias_increment = function_params->bias_increment;
200
201 // Simulate NEON-register transposition of subset of filter.
202 int8 filter_bank_a_0[4][4]; // Depth 4, width 4.
203 int8 filter_bank_a_1[4][4];
204 int8 filter_bank_a_2[4][4];
205 int8 filter_bank_b_0[4][4];
206 int8 filter_bank_b_1[4][4];
207 int8 filter_bank_b_2[4][4];
208
209 // Load filter data in, essentially dropping the [depth/8] dimension, which
210 // is equivalent to loading just the depth needed for one micro-block.
211 //
212 // Each loaded_filter_n has dimensions width 4, sub-block 0 or 1, depth 4;
213 // the three arrays together cover the filter height of 3.
214 uint8 loaded_filter_0[4][2][4];
215 uint8 loaded_filter_1[4][2][4];
216 uint8 loaded_filter_2[4][2][4];
217
218 constexpr int kSymmetricZeroPoint =
219 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
220 const int32 input_offset = function_params->input_offset;
221 TFLITE_DCHECK_GE(input_offset, -255);
222 TFLITE_DCHECK_LE(input_offset, 0);
223 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
224
225 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
226 const uint8* filter_block = filter_data + 8 * j_depth;
227
228 // Filter data is provided as filter_block[3][3][depth/8][2][4].
229 // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
230 // i.e. filter_bank_{a,b}_{0,1,2}[4][4]: sub-block, height 3, depth 4, width 4.
231 for (int x = 0; x < 3; ++x) {
232 memcpy(loaded_filter_0[x][0], &filter_block[3 * 0 * depth + x * depth],
233 8);
234 memcpy(loaded_filter_1[x][0], &filter_block[3 * 1 * depth + x * depth],
235 8);
236 memcpy(loaded_filter_2[x][0], &filter_block[3 * 2 * depth + x * depth],
237 8);
238 }
239 // Pad the filter with -filter_offset, so that the values become 0 when
240 // the filter_offset is later added, and so the filter tap is effectively
241 // disregarded.
242 memset(loaded_filter_0[3][0], kSymmetricZeroPoint, 8);
243 memset(loaded_filter_1[3][0], kSymmetricZeroPoint, 8);
244 memset(loaded_filter_2[3][0], kSymmetricZeroPoint, 8);
245
246 for (int z = 0; z < 4; ++z) {
247 for (int x = 0; x < 4; ++x) {
248 filter_bank_a_0[z][x] =
249 loaded_filter_0[x][0][z] - kSymmetricZeroPoint;
250 filter_bank_b_0[z][x] =
251 loaded_filter_0[x][1][z] - kSymmetricZeroPoint;
252 filter_bank_a_1[z][x] =
253 loaded_filter_1[x][0][z] - kSymmetricZeroPoint;
254 filter_bank_b_1[z][x] =
255 loaded_filter_1[x][1][z] - kSymmetricZeroPoint;
256 filter_bank_a_2[z][x] =
257 loaded_filter_2[x][0][z] - kSymmetricZeroPoint;
258 filter_bank_b_2[z][x] =
259 loaded_filter_2[x][1][z] - kSymmetricZeroPoint;
260 }
261 }
262
263 memcpy(shuffled_filter_data, filter_bank_a_0, 16);
264 shuffled_filter_data += 16;
265 memcpy(shuffled_filter_data, filter_bank_b_0, 16);
266 shuffled_filter_data += 16;
267 memcpy(shuffled_filter_data, filter_bank_a_1, 16);
268 shuffled_filter_data += 16;
269 memcpy(shuffled_filter_data, filter_bank_b_1, 16);
270 shuffled_filter_data += 16;
271 memcpy(shuffled_filter_data, filter_bank_a_2, 16);
272 shuffled_filter_data += 16;
273 memcpy(shuffled_filter_data, filter_bank_b_2, 16);
274 shuffled_filter_data += 16;
275
276 int32 adjusted_bias_data_0[4];
277 int32 adjusted_bias_data_1[4];
278 // For instance, if input_offset == -128, no adjustment is needed.
279 for (int z = 0; z < 4; ++z) {
280 adjusted_bias_data_0[z] = bias_data[z];
281 adjusted_bias_data_1[z] = bias_data[4 + z];
282 for (int x = 0; x < 4; ++x) {
283 adjusted_bias_data_0[z] +=
284 input_offset_difference * filter_bank_a_0[z][x];
285 adjusted_bias_data_0[z] +=
286 input_offset_difference * filter_bank_a_1[z][x];
287 adjusted_bias_data_0[z] +=
288 input_offset_difference * filter_bank_a_2[z][x];
289 adjusted_bias_data_1[z] +=
290 input_offset_difference * filter_bank_b_0[z][x];
291 adjusted_bias_data_1[z] +=
292 input_offset_difference * filter_bank_b_1[z][x];
293 adjusted_bias_data_1[z] +=
294 input_offset_difference * filter_bank_b_2[z][x];
295
296 adjusted_bias_data[z] = adjusted_bias_data_0[z];
297 adjusted_bias_data[4 + z] = adjusted_bias_data_1[z];
298 }
299 }
300 bias_data += 2 * bias_increment;
301 adjusted_bias_data += 8;
302 }
303 }
304 };
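// Layout of the shuffled filter produced per depth micro block by the loop
// above (96 bytes, matching shuffled_filter_increment in the C-model struct):
//   bytes  0..15  filter_bank_a_0  (height 0, sub-block 0)
//   bytes 16..31  filter_bank_b_0  (height 0, sub-block 1)
//   bytes 32..47  filter_bank_a_1  (height 1, sub-block 0)
//   bytes 48..63  filter_bank_b_1  (height 1, sub-block 1)
//   bytes 64..79  filter_bank_a_2  (height 2, sub-block 0)
//   bytes 80..95  filter_bank_b_2  (height 2, sub-block 1)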
305
306 #ifdef USE_NEON
307 template <QuantizationType quantization_type>
308 struct ProcessPerDepth<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
309 quantization_type> {
310 static void ProcessPerDepthIntrinsics(
311 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
312 filter_data,
313 const int32* bias_data, int8* shuffled_filter_data,
314 int32* adjusted_bias_data,
315 const DepthwiseConvDotProdParams* function_params) {
316 const int depth = function_params->output_depth;
317 const int depth_micro_repeats = function_params->depth_micro_repeats;
318 const int bias_increment = function_params->bias_increment;
319
320 constexpr int kSymmetricZeroPoint =
321 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
322 constexpr uint8 kSignBit =
323 QuantizationTypeImpl<quantization_type>::kUint8SignBit;
324 const int32 input_offset = function_params->input_offset;
325 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
326 TFLITE_DCHECK_GE(input_offset, -255);
327 TFLITE_DCHECK_LE(input_offset, 0);
328 }
329 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
330 const int8x16_t ones_vector = vdupq_n_s8(1);
331
332 // Simulate NEON-register transposition of subset of filter.
333 int8x16_t input_0_a;
334 int8x16_t input_0_b;
335 int8x16_t input_0_c;
336 int8x16_t input_1_a;
337 int8x16_t input_1_b;
338 int8x16_t input_1_c;
339 int8x16_t input_2_a;
340 int8x16_t input_2_b;
341 int8x16_t input_2_c;
342
343 int8x16_t filter_0_a;
344 int8x16_t filter_0_b;
345 int8x16_t filter_1_a;
346 int8x16_t filter_1_b;
347 int8x16_t filter_2_a;
348 int8x16_t filter_2_b;
349
350 // For uint8, effect subtraction of zero-point = 128 by XOR of sign bit.
351 const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
352
353 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
354 filter_block = filter_data;
355 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
356 // Filter data is provided as filter_block[3][3][depth/8][2][4].
357 // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
358 // i.e. filter_{0,1,2}_{a,b} registers: height 3, sub-block 0 or 1, depth 4, width 4.
359
360 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
361 filter_block_ptr = filter_block;
362 input_0_a = vld1q_lane_s8x8(filter_block_ptr, input_0_a, 0);
363 filter_block_ptr += depth;
364 input_0_b = vld1q_lane_s8x8(filter_block_ptr, input_0_b, 0);
365 filter_block_ptr += depth;
366 input_0_c = vld1q_lane_s8x8(filter_block_ptr, input_0_c, 0);
367 filter_block_ptr += depth;
368 input_1_a = vld1q_lane_s8x8(filter_block_ptr, input_1_a, 0);
369 filter_block_ptr += depth;
370 input_1_b = vld1q_lane_s8x8(filter_block_ptr, input_1_b, 0);
371 filter_block_ptr += depth;
372 input_1_c = vld1q_lane_s8x8(filter_block_ptr, input_1_c, 0);
373 filter_block_ptr += depth;
374 input_2_a = vld1q_lane_s8x8(filter_block_ptr, input_2_a, 0);
375 filter_block_ptr += depth;
376 input_2_b = vld1q_lane_s8x8(filter_block_ptr, input_2_b, 0);
377 filter_block_ptr += depth;
378 input_2_c = vld1q_lane_s8x8(filter_block_ptr, input_2_c, 0);
379
380 filter_0_a = vzip1q_s8(input_0_a, input_0_b);
381 filter_0_b = vzip1q_s8(input_0_c, sign_bit);
382 filter_1_a = vzip1q_s8(input_1_a, input_1_b);
383 filter_1_b = vzip1q_s8(input_1_c, sign_bit);
384 filter_2_a = vzip1q_s8(input_2_a, input_2_b);
385 filter_2_b = vzip1q_s8(input_2_c, sign_bit);
386 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
387 filter_0_a = veorq_s8(filter_0_a, sign_bit);
388 filter_0_b = veorq_s8(filter_0_b, sign_bit);
389 filter_1_a = veorq_s8(filter_1_a, sign_bit);
390 filter_1_b = veorq_s8(filter_1_b, sign_bit);
391 filter_2_a = veorq_s8(filter_2_a, sign_bit);
392 filter_2_b = veorq_s8(filter_2_b, sign_bit);
393 }
394 vzipq_s8x2_in_place(&filter_0_a, &filter_0_b);
395 vzipq_s8x2_in_place(&filter_1_a, &filter_1_b);
396 vzipq_s8x2_in_place(&filter_2_a, &filter_2_b);
397
398 vst1q_s8(shuffled_filter_data, filter_0_a);
399 shuffled_filter_data += 16;
400 vst1q_s8(shuffled_filter_data, filter_0_b);
401 shuffled_filter_data += 16;
402 vst1q_s8(shuffled_filter_data, filter_1_a);
403 shuffled_filter_data += 16;
404 vst1q_s8(shuffled_filter_data, filter_1_b);
405 shuffled_filter_data += 16;
406 vst1q_s8(shuffled_filter_data, filter_2_a);
407 shuffled_filter_data += 16;
408 vst1q_s8(shuffled_filter_data, filter_2_b);
409 shuffled_filter_data += 16;
410
411 int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
412 bias_data += bias_increment;
413 int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
414 bias_data += bias_increment;
415 // For instance, if input_offset is -kIntSymmetricZeroPoint, no adjustment
416 // is needed.
417
418 int32x4_t filter_sum_a = vdupq_n_s32(0);
419 filter_sum_a = vdotq_s32(filter_sum_a, filter_0_a, ones_vector);
420 filter_sum_a = vdotq_s32(filter_sum_a, filter_1_a, ones_vector);
421 filter_sum_a = vdotq_s32(filter_sum_a, filter_2_a, ones_vector);
422 int32x4_t filter_sum_b = vdupq_n_s32(0);
423 filter_sum_b = vdotq_s32(filter_sum_b, filter_0_b, ones_vector);
424 filter_sum_b = vdotq_s32(filter_sum_b, filter_1_b, ones_vector);
425 filter_sum_b = vdotq_s32(filter_sum_b, filter_2_b, ones_vector);
426
427 adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a,
428 input_offset_difference);
429 adjusted_bias_data_b = vmlaq_n_s32(adjusted_bias_data_b, filter_sum_b,
430 input_offset_difference);
431
432 vst1q_s32(adjusted_bias_data, adjusted_bias_data_a);
433 adjusted_bias_data += 4;
434 vst1q_s32(adjusted_bias_data, adjusted_bias_data_b);
435 adjusted_bias_data += 4;
436
437 filter_block += 8;
438 }
439 }
440
441 static inline void Run(const typename QuantizationTypeImpl<
442 quantization_type>::ExternalType* filter_data,
443 const int32* bias_data, int8* shuffled_filter_data,
444 int32* adjusted_bias_data,
445 const DepthwiseConvDotProdParams* function_params) {
446 ProcessPerDepthIntrinsics(filter_data, bias_data, shuffled_filter_data,
447 adjusted_bias_data, function_params);
448 }
449 };
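// The bias adjustment above exploits the fact that a dot product against a
// vector of all ones is a horizontal sum: vdotq_s32(acc, x, ones) adds each
// group of four int8 lanes of x into the corresponding int32 lane of acc.
// A scalar model of that single step, added here for reference only (this
// helper is not used by the kernels):
inline void util_model_vdotq_s32_with_ones(const int8 x[16], int32 acc[4]) {
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      acc[i] += x[4 * i + j];
    }
  }
}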
450 #endif
451
452 template <QuantizationType quantization_type, int32 max_padding>
453 struct PackMacroBlock<
454 DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
455 DepthwiseConvDepthMultiplication::kNoMultiplication, max_padding> {
456 // A straight copy of a macro block of input data into a scratch buffer.
457 //
458 // Requirement: depth_micro_repeats > 0.
459 static inline void CopyMacroBlock(
460 int32 height_block_number, int32 width_block_number,
461 const DepthwiseConvDotProdParams& function_params,
462 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
463 input_block_data,
464 int8* scratch_block_data) {
465 TFLITE_DCHECK_LE(max_padding, 1);
466
467 // Strides.
468 // The input depth and count of micro blocks provide the width strides.
469 const int input_height_stride = function_params.input_height_stride;
470 const int workspace_height_stride = function_params.workspace_height_stride;
471 const int input_depth = function_params.input_depth;
472 const int depth_micro_repeats = function_params.depth_micro_repeats;
473 TFLITE_DCHECK_GT(depth_micro_repeats, 0);
474
475 // Remaining iteration and dimension parameters.
476 //
477 // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
478 // final micro block is incomplete.
479 const int width_overall_micro_repeats =
480 function_params.input_width_overall_micro_repeats;
481 int input_width_micro_repeats = function_params.input_width_micro_repeats;
482 const int residual_width = function_params.residual_width;
483 const int block_height = function_params.inbound_block_height;
484
485 const int padding_left = function_params.padding_left;
486 const int padding_right = function_params.padding_right;
487 const int padding_top = function_params.padding_top;
488 const int padding_bottom = function_params.padding_bottom;
489
490 const bool leading_width_padding =
491 padding_left > 0 && width_block_number == 0;
492 const bool trailing_width_padding =
493 padding_right > 0 &&
494 width_block_number == (function_params.width_macro_count - 1);
495 const bool leading_height_padding =
496 padding_top > 0 && height_block_number < 0;
497 const bool trailing_height_padding =
498 padding_bottom > 0 &&
499 height_block_number == (function_params.height_macro_count - 1);
500
501 // Modify the trailing case to reflect the input width.
502 int input_residual_width =
503 input_width_micro_repeats < width_overall_micro_repeats ? residual_width
504 : 4;
505 if (trailing_width_padding) {
506 input_residual_width -= 1;
507 input_width_micro_repeats = width_overall_micro_repeats - 1;
508 }
509
510 constexpr int kSymmetricZeroPoint =
511 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
512 const int32 input_offset_difference =
513 function_params.input_offset + kSymmetricZeroPoint;
514
515 // We load data into a temporary buffer and then save, to match subsequent
516 // processing. This will make it easier to combine stages into one ASM
517 // routine.
518 int8 tmp_load[4][2][4];
519
520 int copy_block_height = block_height;
521 if (leading_height_padding) {
522 memset(scratch_block_data, -input_offset_difference,
523 workspace_height_stride);
524 scratch_block_data += workspace_height_stride;
525 input_block_data += input_height_stride;
526 copy_block_height -= 1;
527 }
528 if (trailing_height_padding) {
529 copy_block_height -= 1;
530 }
531
532 // The outer 3 loops go through all the micro blocks in a macro block.
533 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
534 for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
535 // Figure out division of work (available input vs trailing padding).
536 int adjusted_residual_width =
537 j_width == input_width_micro_repeats ? input_residual_width : 4;
538
539 int start_width = 0;
540 if (leading_width_padding && j_width == 0) {
541 start_width = 1;
542 memset(tmp_load[0][0], -input_offset_difference, 8);
543 }
544 if (adjusted_residual_width < 4) {
545 for (int x = adjusted_residual_width; x < 4; ++x) {
546 memset(tmp_load[x][0], -input_offset_difference, 8);
547 }
548 }
549
550 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
551 // The inner 3 loops go through the sub-block, depth and width within
552 // each micro block.
553
554 // Load, and apply symmetric offset.
555 int8* scratch_data =
556 scratch_block_data + k_height * workspace_height_stride +
557 j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
558 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
559 input_data = input_block_data + k_height * input_height_stride +
560 j_width * 4 * input_depth + i_depth * 8;
561 // Full-size micro blocks are 2*4*4 = 32 bytes.
562 for (int x = start_width; x < adjusted_residual_width; ++x) {
563 for (int s = 0; s < 2; ++s) {
564 for (int d = 0; d < 4; ++d) {
565 tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
566 kSymmetricZeroPoint;
567 }
568 }
569 }
570
571 // Save results.
572 memcpy(&scratch_data[0], tmp_load[0][0], 8);
573 memcpy(&scratch_data[8], tmp_load[1][0], 8);
574 memcpy(&scratch_data[16], tmp_load[2][0], 8);
575 memcpy(&scratch_data[24], tmp_load[3][0], 8);
576 }
577 }
578 }
579
580 if (trailing_height_padding) {
581 memset(scratch_block_data + copy_block_height * workspace_height_stride,
582 -input_offset_difference, workspace_height_stride);
583 }
584 }
585
586 // Transpose 4x4 blocks within each sub-micro-block.
587 //
588 // Implemented somewhat like NEON register manipulation, so that we can see
589 // equivalence of the two approaches.
590 static inline void MicroTransposeBlocks(
591 const DepthwiseConvDotProdParams& function_params,
592 int8* scratch_block_data) {
593 const int workspace_height_stride = function_params.workspace_height_stride;
594 const int width_overall_micro_repeats =
595 function_params.input_width_overall_micro_repeats;
596 const int depth_micro_repeats = function_params.depth_micro_repeats;
597 const int block_height = function_params.inbound_block_height;
598
599 // Transpositions are 4x4, but doing 2 at a time is more efficient in the
600 // NEON code we are simulating.
601 int8 tmp_load[4][2][4]; // [width][sub-block][depth]
602 int8 tmp_transposed[4][2][4]; // [depth][sub-block][width]
603 int8 tmp_interleaved[2][4][4]; // [sub-block][depth][width]
604
605 // The outer 3 loops go through all the micro blocks in a macro block.
606 for (int k_height = 0; k_height < block_height; ++k_height) {
607 for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
608 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
609 int8* scratch_data =
610 scratch_block_data + k_height * workspace_height_stride +
611 j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
612 // A. Load data
613 memcpy(tmp_load[0][0], &scratch_data[0], 8);
614 memcpy(tmp_load[1][0], &scratch_data[8], 8);
615 memcpy(tmp_load[2][0], &scratch_data[16], 8);
616 memcpy(tmp_load[3][0], &scratch_data[24], 8);
617
618 // B. Simulate between-register transposition.
619 for (int x = 0; x < 4; ++x) {
620 for (int y = 0; y < 4; ++y) {
621 tmp_transposed[x][0][y] = tmp_load[y][0][x];
622 tmp_transposed[x][1][y] = tmp_load[y][1][x];
623 }
624 }
625
626 // C. Simulate between-register interleaving.
627 for (int x = 0; x < 4; ++x) {
628 for (int y = 0; y < 4; ++y) {
629 tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
630 tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
631 }
632 }
633 // D. Simulate mangled storage arrangement.
634 memcpy(&scratch_data[0], tmp_interleaved[0][0], 16);
635 memcpy(&scratch_data[16], tmp_interleaved[1][0], 16);
636 }
637 }
638 }
639 }
640
641 static inline void Run(
642 int32 height_block_number, int32 width_block_number,
643 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
644 input_block_data,
645 int8* scratch_block_data,
646 const DepthwiseConvDotProdParams* function_params) {
647 CopyMacroBlock(height_block_number, width_block_number, *function_params,
648 input_block_data, scratch_block_data);
649 MicroTransposeBlocks(*function_params, scratch_block_data);
650 }
651 };
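// The workspace addressing used by CopyMacroBlock and MicroTransposeBlocks
// above, spelled out as a helper (added for reference only; the kernels
// inline the same expression): rows are strided by workspace_height_stride,
// and within a row each depth slice holds its 32-byte micro blocks
// contiguously across the width.
inline int util_example_workspace_offset(int k_height, int j_width,
                                         int i_depth,
                                         int workspace_height_stride,
                                         int width_overall_micro_repeats) {
  constexpr int kMicroBlockBytes = 4 * 8;  // 4 width positions x 8 channels.
  return k_height * workspace_height_stride + j_width * kMicroBlockBytes +
         i_depth * kMicroBlockBytes * width_overall_micro_repeats;
}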
652
653 template <QuantizationType quantization_type, int32 max_padding>
654 struct PackMacroBlock<
655 DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
656 DepthwiseConvDepthMultiplication::kUnitInputDepth, max_padding> {
657 static inline void Run(
658 int32 height_block_number, int32 width_block_number,
659 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
660 input_block_data,
661 int8* scratch_block_data,
662 const DepthwiseConvDotProdParams* function_params) {
663 // Currently support for padding is limited to 1 on any side.
664 TFLITE_DCHECK_LE(max_padding, 1);
665
666 // Strides.
667 // The count of micro blocks (below) provides the width strides.
668 const int input_height_stride = function_params->input_height_stride;
669 const int workspace_height_stride =
670 function_params->workspace_height_stride;
671
672 // Remaining iteration and dimension parameters.
673 //
674 // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
675 // final micro block is incomplete.
676 const int width_overall_micro_repeats =
677 function_params->input_width_overall_micro_repeats;
678 const int input_width_micro_repeats =
679 function_params->input_width_micro_repeats;
680 const int residual_width = function_params->residual_width;
681 const int block_height = function_params->inbound_block_height;
682 TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
683
684 const int padding_left = function_params->padding_left;
685 const int padding_right = function_params->padding_right;
686 const int padding_top = function_params->padding_top;
687 const int padding_bottom = function_params->padding_bottom;
688
689 const bool leading_width_padding =
690 padding_left > 0 && width_block_number == 0;
691 const bool trailing_width_padding =
692 padding_right > 0 &&
693 width_block_number == (function_params->width_macro_count - 1);
694 const bool leading_height_padding =
695 padding_top > 0 && height_block_number < 0;
696 const bool trailing_height_padding =
697 padding_bottom > 0 &&
698 height_block_number == (function_params->height_macro_count - 1);
699
700 constexpr int kSymmetricZeroPoint =
701 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
702 const int32 input_offset_difference =
703 function_params->input_offset + kSymmetricZeroPoint;
704
705 int copy_block_height = block_height;
706 if (leading_height_padding) {
707 memset(scratch_block_data, -input_offset_difference,
708 workspace_height_stride + kWorkspaceExtension);
709 scratch_block_data += workspace_height_stride;
710 input_block_data += input_height_stride;
711 copy_block_height -= 1;
712 }
713 if (trailing_height_padding) {
714 copy_block_height -= 1;
715 }
716
717 int adjusted_residual_width =
718 input_width_micro_repeats < width_overall_micro_repeats ? residual_width
719 : 4;
720
721 if (trailing_width_padding) {
722 adjusted_residual_width -= 1;
723 }
724 int start_width = 0;
725 if (leading_width_padding) {
726 start_width = 1;
727 input_block_data += 1;
728 }
729
730 const int copy_size = (width_overall_micro_repeats - 1) * 4 +
731 adjusted_residual_width - start_width;
732
733 TFLITE_DCHECK_LE(
734 copy_size,
735 input_height_stride - width_block_number * input_width_micro_repeats);
736 // We may drop up to stride-1 of trailing input.
737 TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
738
739 // When there is unit input depth, the micro-block iteration need only be
740 // through the height. The micro blocks are contiguous across the width.
741 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
742 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
743 input_data = input_block_data + k_height * input_height_stride;
744 int8* scratch_data =
745 scratch_block_data + k_height * workspace_height_stride;
746
747 // Handle leading padding. This is overwritten if there is no padding.
748 scratch_data[0] = -input_offset_difference;
749
750 memcpy(&scratch_data[start_width], input_data, copy_size);
751 for (int i = 0; i < copy_size; ++i) {
752 scratch_data[start_width + i] += -kSymmetricZeroPoint;
753 }
754
755 // Handle trailing padding, and fill in remainder of micro block.
756 memset(&scratch_data[start_width + copy_size], -input_offset_difference,
757 4 - adjusted_residual_width + kWorkspaceExtension);
758 }
759
760 if (trailing_height_padding) {
761 memset(scratch_block_data + copy_block_height * workspace_height_stride,
762 -input_offset_difference,
763 workspace_height_stride + kWorkspaceExtension);
764 }
765 }
766 };
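// Worked example of a single workspace row written above, with hypothetical
// sizes: width_overall_micro_repeats = 3, input_width_micro_repeats = 2,
// residual_width = 2, and padding of 1 on both left and right. Then
//   start_width = 1, adjusted_residual_width = 2 - 1 = 1,
//   copy_size = (3 - 1) * 4 + 1 - 1 = 8,
// and each row is written as 1 leading padding byte, 8 offset-adjusted input
// bytes, and (4 - 1) + kWorkspaceExtension trailing padding bytes, i.e.
// 4 * width_overall_micro_repeats = 12 data bytes plus the extension.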
767
768 // Beginning of code section containing intermediate code transformation.
769 //
770 // This section is only compiled when kUseUnwound3x3DotProduct versions of
771 // templated functions are selected.
772 template <QuantizationType quantization_type>
773 struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
774 quantization_type,
775 DepthwiseConvDepthMultiplication::kNoMultiplication,
776 /*max_padding=*/0> {
777 static inline void Run(
778 int32 height_block_number, int32 width_block_number,
779 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
780 input_block_data,
781 int8* scratch_block_data,
782 const DepthwiseConvDotProdParams* function_params) {
783 const int workspace_height_stride =
784 function_params->workspace_height_stride;
785 const int width_overall_micro_repeats =
786 function_params->input_width_overall_micro_repeats;
787 const int input_width_micro_repeats =
788 function_params->input_width_micro_repeats;
789 const int depth_micro_repeats = function_params->depth_micro_repeats;
790 const int block_height = function_params->inbound_block_height;
791 const int residual_width = function_params->residual_width;
792 const int input_height_stride = function_params->input_height_stride;
793 const int input_depth = function_params->input_depth;
794
795 TFLITE_DCHECK_GE(depth_micro_repeats, 0);
796 constexpr int kSymmetricZeroPoint =
797 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
798 const int micro_block_size = 4 * 8;
799 const int depth_advance = width_overall_micro_repeats * micro_block_size;
800 const int width_advance =
801 micro_block_size *
802 (1 - depth_micro_repeats * width_overall_micro_repeats);
803 const int height_advance = workspace_height_stride -
804 width_overall_micro_repeats * micro_block_size;
805 const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
806
807 // Transpositions are 4x4, but doing 2 at a time is more efficient in the
808 // NEON code we are simulating. Note the blocks of 4x4 are still interleaved
809 // down the depth.
810 int8 tmp_load[4][2][4];
811 int8 tmp_transposed[4][2][4];
812 int8 tmp_interleaved[2][4][4];
813
814 // Work through one slice, by row, at a time.
815 int8* scratch_data = scratch_block_data;
816 for (int k_height = 0; k_height < block_height; ++k_height) {
817 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
818 input_data = input_block_data;
819 input_block_data += input_height_stride;
820
821 // Traverse the width one point at a time, but the depth in (micro) blocks
822 // of size 8.
823 //
824 // The depth and width margins, which are filled with "zeros", may be
825 // larger than is strictly needed to calculate output. This is because the
826 // conv calculation is performed across complete micro blocks.
827 for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
828 // Load, then zero.
829 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
830 // A. Simulate register loading.
831 for (int x = 0; x < 4; ++x) {
832 for (int s = 0; s < 2; ++s) {
833 for (int d = 0; d < 4; ++d) {
834 tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
835 kSymmetricZeroPoint;
836 }
837 }
838 }
839 // B. Simulate between-register transposition.
840 for (int x = 0; x < 4; ++x) {
841 for (int y = 0; y < 4; ++y) {
842 tmp_transposed[x][0][y] = tmp_load[y][0][x];
843 tmp_transposed[x][1][y] = tmp_load[y][1][x];
844 }
845 }
846
847 // C and D are to be performed together as 4-byte stores in NEON code.
848 // C. Simulate between-register interleaving.
849 for (int x = 0; x < 4; ++x) {
850 for (int y = 0; y < 4; ++y) {
851 tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
852 tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
853 }
854 }
855 // D. Simulate mangled storage arrangement.
856 memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
857 memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
858 memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
859 memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);
860
861 scratch_data += depth_advance;
862 input_data += 8;
863 }
864 scratch_data += width_advance;
865 input_data += input_depth_skip;
866 }
867 if (width_overall_micro_repeats > input_width_micro_repeats) {
868 TFLITE_DCHECK_EQ(width_overall_micro_repeats,
869 input_width_micro_repeats + 1);
870 TFLITE_DCHECK_GT(residual_width, 0);
871 // Figure out division of work (available input vs zero-ed).
872 const int adjusted_residual_width = residual_width;
873 // Load, then zero.
874 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
875 // A. Simulate register loading.
876 for (int x = 0; x < adjusted_residual_width; ++x) {
877 for (int s = 0; s < 2; ++s) {
878 for (int d = 0; d < 4; ++d) {
879 tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
880 kSymmetricZeroPoint;
881 }
882 }
883 }
884 for (int x = adjusted_residual_width; x < 4; ++x) {
885 for (int s = 0; s < 2; ++s) {
886 for (int d = 0; d < 4; ++d) {
887 tmp_load[x][s][d] = 0;
888 }
889 }
890 }
891 // B. Simulate between-register transposition.
892 for (int x = 0; x < 4; ++x) {
893 for (int y = 0; y < 4; ++y) {
894 tmp_transposed[x][0][y] = tmp_load[y][0][x];
895 tmp_transposed[x][1][y] = tmp_load[y][1][x];
896 }
897 }
898
899 // C and D are to be performed together as 4-byte stores in NEON code.
900 // C. Simulate between-register interleaving.
901 for (int x = 0; x < 4; ++x) {
902 for (int y = 0; y < 4; ++y) {
903 tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
904 tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
905 }
906 }
907 // D. Simulate mangled storage arrangement.
908 memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
909 memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
910 memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
911 memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);
912
913 scratch_data += depth_advance;
914 input_data += 8;
915 }
916 scratch_data += width_advance;
917 input_data += input_depth_skip;
918 }
919 scratch_data += height_advance;
920 }
921
922 TFLITE_DCHECK_EQ(scratch_data, scratch_block_data +
923 block_height * workspace_height_stride);
924 }
925 };
926
927 template <QuantizationType quantization_type>
928 struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
929 quantization_type,
930 DepthwiseConvDepthMultiplication::kNoMultiplication,
931 /*max_padding=*/1> {
932 static inline void Run(
933 int32 height_block_number, int32 width_block_number,
934 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
935 input_block_data,
936 int8* scratch_block_data,
937 const DepthwiseConvDotProdParams* function_params) {
938 // Just use the C-model code for the padding case. Optimized versions merge
939 // the modifications therein to handle padding.
940 PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
941 quantization_type,
942 DepthwiseConvDepthMultiplication::kNoMultiplication,
943 /*max_padding=*/1>::Run(height_block_number,
944 width_block_number, input_block_data,
945 scratch_block_data, function_params);
946 }
947 };
948
949 template <QuantizationType quantization_type, int32 max_padding>
950 struct PackMacroBlock<
951 DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
952 DepthwiseConvDepthMultiplication::kUnitInputDepth, max_padding> {
953 static inline void Run(
954 int32 height_block_number, int32 width_block_number,
955 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
956 input_block_data,
957 int8* scratch_block_data,
958 const DepthwiseConvDotProdParams* function_params) {
959 const int workspace_height_stride =
960 function_params->workspace_height_stride;
961 const int width_overall_micro_repeats =
962 function_params->input_width_overall_micro_repeats;
963 const int input_width_micro_repeats =
964 function_params->input_width_micro_repeats;
965 const int block_height = function_params->inbound_block_height;
966 const int residual_width = function_params->residual_width;
967 const int input_height_stride = function_params->input_height_stride;
968
969 const int padding_left = function_params->padding_left;
970 const int padding_right = function_params->padding_right;
971 const int padding_top = function_params->padding_top;
972 const int padding_bottom = function_params->padding_bottom;
973
974 constexpr int kSymmetricZeroPoint =
975 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
976
977 TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
978
979 const bool leading_width_padding =
980 padding_left > 0 && width_block_number == 0;
981 const bool trailing_width_padding =
982 padding_right > 0 &&
983 width_block_number == (function_params->width_macro_count - 1);
984 const bool leading_height_padding =
985 padding_top > 0 && height_block_number < 0;
986 const bool trailing_height_padding =
987 padding_bottom > 0 &&
988 height_block_number == (function_params->height_macro_count - 1);
989
990 const int32 input_offset = function_params->input_offset;
991 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
992
993 // Work through one slice, by row, at a time.
994 int8* scratch_data_base = scratch_block_data;
995
996 int copy_block_height = block_height;
997 if (leading_height_padding) {
998 copy_block_height -= 1;
999 memset(scratch_data_base, -input_offset_difference,
1000 workspace_height_stride + kWorkspaceExtension);
1001 scratch_data_base += workspace_height_stride;
1002 input_block_data += input_height_stride;
1003 }
1004 if (trailing_height_padding) {
1005 copy_block_height -= 1;
1006 }
1007
1008 int adjusted_residual_width =
1009 input_width_micro_repeats < width_overall_micro_repeats ? residual_width
1010 : 4;
1011
1012 if (trailing_width_padding) {
1013 adjusted_residual_width -= 1;
1014 }
1015 int start_width = 0;
1016 if (leading_width_padding) {
1017 start_width = 1;
1018 input_block_data += 1;
1019 }
1020
1021 const int copy_size = (width_overall_micro_repeats - 1) * 4 +
1022 adjusted_residual_width - start_width;
1023 // Adjusted so that later conditionals are simplified.
1024 const int copy_size_adjusted =
1025 trailing_width_padding ? copy_size + 1 : copy_size;
1026
1027 TFLITE_DCHECK_LE(
1028 copy_size,
1029 input_height_stride - width_block_number * input_width_micro_repeats);
1030 // We may drop up to stride-1 of trailing input.
1031 TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
1032
1033 // This is used to simulate what should happen in registers.
1034 int8 tmp_data[16];
1035
1036 int scratch_data_offset = 0;
1037 int input_block_offset = 0;
1038
1039 if (copy_size >= 16) {
1040 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1041 // Work through one slice, by row, at a time.
1042 int8* scratch_data = scratch_data_base + scratch_data_offset;
1043
1044 int copy_done = 0;
1045
1046 // The surrounding condition ensures that we always need at least one
1047 // iteration of the main copy loop. In the case of leading width
1048 // padding, we unroll this specially.
1049 if (leading_width_padding) {
1050 memcpy(tmp_data + 1, input_block_data + input_block_offset, 15);
1051 for (int i = 0; i < 16; ++i) {
1052 tmp_data[i] += -kSymmetricZeroPoint;
1053 }
1054 tmp_data[0] = -input_offset_difference;
1055 memcpy(scratch_data, tmp_data, 16);
1056 copy_done += 15;
1057 }
1058
1059 // Main copy loop.
1060 for (; (copy_done + 16) <= copy_size; copy_done += 16) {
1061 memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
1062 16);
1063 for (int i = 0; i < 16; ++i) {
1064 tmp_data[i] += -kSymmetricZeroPoint;
1065 }
1066 TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
1067 memcpy(&scratch_data[start_width + copy_done], tmp_data, 16);
1068 }
1069
1070 const int copy_remaining = copy_size - copy_done;
1071 // Total amount
1072 // = copy_size - copy_done + 4 - adjusted_residual_width
1073 // = width_overall_micro_repeats * 4 - start_width - copy_done.
1074 // Undone micro blocks
1075 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
1076
1077 // Conditional is (copy_remaining > 0 || trailing_width_padding).
1078 if (copy_done < copy_size_adjusted) {
1079 // Employ overlapping-load strategy in order to load full register,
1080 // but use only part.
1081 memcpy(tmp_data,
1082 input_block_data + input_block_offset + copy_done -
1083 (16 - copy_remaining),
1084 16);
1085 // Shift to select the part that we need.
1086 for (int i = 0; i < copy_remaining; ++i) {
1087 tmp_data[i] = tmp_data[(16 - copy_remaining) + i];
1088 }
1089 for (int i = 0; i < 16; ++i) {
1090 tmp_data[i] += -kSymmetricZeroPoint;
1091 }
1092 // Apply padding to remainder, some unnecessary but costless in regs.
1093 for (int i = copy_remaining; i < 16; ++i) {
1094 tmp_data[i] = -input_offset_difference;
1095 }
1096 const int final_repeats =
1097 width_overall_micro_repeats - (start_width + copy_done) / 4;
1098 for (int i = 0; i < final_repeats; ++i) {
1099 memcpy(&scratch_data[start_width + copy_done], tmp_data + 4 * i, 4);
1100 copy_done += 4;
1101 }
1102 }
1103 memset(scratch_data + start_width + copy_done, -input_offset_difference,
1104 kWorkspaceExtension);
1105
1106 scratch_data_offset += workspace_height_stride;
1107 input_block_offset += input_height_stride;
1108 }
1109 } else if (copy_size >= 4) {
1110 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1111 // Work through one slice, by row, at a time.
1112 int8* scratch_data = scratch_data_base + scratch_data_offset;
1113
1114 int copy_done = 0;
1115
1116 // The surrounding condition ensures that we always need at least one
1117 // iteration of the main copy loop. In the case of leading width
1118 // padding, we unroll this specially.
1119 if (leading_width_padding) {
1120 memcpy(tmp_data + 1, input_block_data + input_block_offset, 3);
1121 for (int i = 0; i < 4; ++i) {
1122 tmp_data[i] += -kSymmetricZeroPoint;
1123 }
1124 tmp_data[0] = -input_offset_difference;
1125 memcpy(scratch_data, tmp_data, 4);
1126 copy_done += 3;
1127 }
1128
1129 for (; (copy_done + 4) <= copy_size; copy_done += 4) {
1130 memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
1131 4);
1132 for (int i = 0; i < 4; ++i) {
1133 tmp_data[i] += -kSymmetricZeroPoint;
1134 }
1135 // Perform as 4 int32 stores, because that is our alignment.
1136 memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
1137 }
1138
1139 // Total amount
1140 // = copy_size - copy_done + 4 - adjusted_residual_width
1141 // = width_overall_micro_repeats * 4 - start_width - copy_done.
1142 // Undone micro blocks
1143 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
1144 const int copy_remaining = copy_size - copy_done;
1145 // Conditional is (copy_remaining > 0 || trailing_width_padding).
1146 if (copy_done < copy_size_adjusted) {
1147 TFLITE_DCHECK_LT(copy_remaining, 4);
1148 // Employ overlapping-load strategy in order to load full register,
1149 // but use only part.
1150 memcpy(tmp_data,
1151 input_block_data + input_block_offset + copy_done -
1152 (4 - copy_remaining),
1153 4);
1154 // Shift to select the part that we need.
1155 for (int i = 0; i < copy_remaining; ++i) {
1156 tmp_data[i] = tmp_data[(4 - copy_remaining) + i];
1157 }
1158 for (int i = 0; i < 4; ++i) {
1159 tmp_data[i] += -kSymmetricZeroPoint;
1160 }
1161 // Apply padding to remainder, some unnecessary but costless in regs.
1162 for (int i = copy_remaining; i < 4; ++i) {
1163 tmp_data[i] = -input_offset_difference;
1164 }
1165 memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
1166 copy_done += 4;
1167 }
1168 memset(scratch_data + start_width + copy_done, -input_offset_difference,
1169 kWorkspaceExtension);
1170
1171 scratch_data_offset += workspace_height_stride;
1172 input_block_offset += input_height_stride;
1173 }
1174 } else if (width_overall_micro_repeats == 2) {
1175 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1176 // Apply padding by quick fill of whole reg.
1177 for (int i = 0; i < 8; ++i) {
1178 tmp_data[i] = -input_offset;
1179 }
1180 for (int i = 0; i < copy_size; ++i) {
1181 // Apply shift-left insert, tmp_data as both operands.
1182 // The zero-index byte is left unchanged.
1183 for (int i = 7; i > 0; --i) {
1184 tmp_data[i] = tmp_data[i - 1];
1185 }
1186 tmp_data[1] =
1187 input_block_data[input_block_offset + (copy_size - 1 - i)];
1188 }
1189 if (!leading_width_padding) {
1190 // Remove leading padding, junking trailing byte, OK because max size
1191 // is less than 8.
1192 TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
1193 for (int i = 0; i < 7; ++i) {
1194 tmp_data[i] = tmp_data[i + 1];
1195 }
1196 }
1197 for (int i = 0; i < 8; ++i) {
1198 tmp_data[i] += -kSymmetricZeroPoint;
1199 }
1200 memcpy(scratch_data_base + scratch_data_offset, tmp_data, 8);
1201 memset(scratch_data_base + scratch_data_offset + 8,
1202 -input_offset_difference, kWorkspaceExtension);
1203
1204 scratch_data_offset += workspace_height_stride;
1205 input_block_offset += input_height_stride;
1206 }
1207 } else {
1208 TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
1209 // This path is basically the same as the preceding, 2-micro-block one,
1210 // but here we simply store fewer bytes.
1211 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1212 // Apply padding by quick fill of whole reg.
1213 for (int i = 0; i < 8; ++i) {
1214 tmp_data[i] = -input_offset;
1215 }
1216 for (int i = 0; i < copy_size; ++i) {
1217 // Apply shift-left insert, tmp_data as both operands.
1218 // The zero-index byte is left unchanged.
1219 for (int i = 7; i > 0; --i) {
1220 tmp_data[i] = tmp_data[i - 1];
1221 }
1222 tmp_data[1] =
1223 input_block_data[input_block_offset + (copy_size - 1 - i)];
1224 }
1225 if (!leading_width_padding) {
1226 // Remove leading padding, junking trailing byte, OK because max size
1227 // is less than 8.
1228 TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
1229 for (int i = 0; i < 7; ++i) {
1230 tmp_data[i] = tmp_data[i + 1];
1231 }
1232 }
1233 for (int i = 0; i < 8; ++i) {
1234 tmp_data[i] += -kSymmetricZeroPoint;
1235 }
1236 memcpy(scratch_data_base + scratch_data_offset, tmp_data, 4);
1237 memset(scratch_data_base + scratch_data_offset + 4,
1238 -input_offset_difference, kWorkspaceExtension);
1239
1240 scratch_data_offset += workspace_height_stride;
1241 input_block_offset += input_height_stride;
1242 }
1243 }
1244
1245 scratch_data_base += copy_block_height * workspace_height_stride;
1246
1247 if (trailing_height_padding) {
1248 memset(scratch_data_base, -input_offset_difference,
1249 workspace_height_stride + kWorkspaceExtension);
1250 scratch_data_base += workspace_height_stride;
1251 }
1252
1253 TFLITE_DCHECK_EQ(
1254 scratch_data_base,
1255 scratch_block_data + block_height * workspace_height_stride);
1256 }
1257 };
1258 // The preceding section is only compiled when kUseUnwound3x3DotProduct versions
1259 // of templated functions are selected.
1260 //
1261 // End of code section containing intermediate code transformation.
1262
1263 #ifdef USE_NEON
1264 template <QuantizationType quantization_type>
1265 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1266 quantization_type,
1267 DepthwiseConvDepthMultiplication::kNoMultiplication,
1268 /*max_padding=*/0> {
1269 static inline void PackMacroBlockIntrinsics(
1270 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1271 input_block_data,
1272 int8* scratch_block_data,
1273 const DepthwiseConvDotProdParams* function_params) {
1274 TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
1275 TFLITE_DCHECK_EQ(function_params->padding_top, 0);
1276 TFLITE_DCHECK_EQ(function_params->padding_left, 0);
1277 TFLITE_DCHECK_EQ(function_params->padding_right, 0);
1278 const int workspace_height_stride =
1279 function_params->workspace_height_stride;
1280 const int width_overall_micro_repeats =
1281 function_params->input_width_overall_micro_repeats;
1282 const int input_width_micro_repeats =
1283 function_params->input_width_micro_repeats;
1284 const int depth_micro_repeats = function_params->depth_micro_repeats;
1285 const int block_height = function_params->inbound_block_height;
1286 const int residual_width = function_params->residual_width;
1287 const int input_height_stride = function_params->input_height_stride;
1288 const int input_depth = function_params->input_depth;
1289
1290 TFLITE_DCHECK_GE(depth_micro_repeats, 0);
1291 constexpr uint8 kSignBit =
1292 QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1293 const int micro_block_size = 4 * 8;
1294 const int depth_advance = width_overall_micro_repeats * micro_block_size;
1295 const int width_advance =
1296 micro_block_size *
1297 (1 - depth_micro_repeats * width_overall_micro_repeats);
1298 const int height_advance = workspace_height_stride -
1299 width_overall_micro_repeats * micro_block_size;
1300 const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
1301
1302 // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1303 // code. Note the blocks of 4x4 are still interleaved down the depth.
1304 int8x16_t work_reg_a;
1305 int8x16_t work_reg_b;
1306
1307 // Effect subtraction of zero-point = 128 by XOR of sign bit.
1308 const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
1309
1310 // Work through one slice, by row, at a time.
1311 int8* scratch_data_0 = scratch_block_data;
1312
1313 for (int k_height = 0; k_height < block_height; ++k_height) {
1314 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1315 input_data_0 = input_block_data;
1316 int8x16_t input_data_a;
1317 int8x16_t input_data_b;
1318 int8x16_t input_data_c;
1319 int8x16_t input_data_d;
1320
1321 // Traverse the width one point at a time, but the depth in (micro) blocks
1322 // of size 8.
1323 //
1324 // The depth and width margins, which are filled with "zeros", may be
1325 // larger than is strictly needed to calculate output. This is because the
1326 // conv calculation is performed across complete micro blocks.
1327 for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
1328 int8x16_t work_reg_a_sp;
1329 int8x16_t work_reg_b_sp;
1330
1331 int i_depth = 0;
1332
1333 if (depth_micro_repeats >= 2) {
1334 i_depth += 2;
1335
1336 input_data_a = util_vld1q_x8(input_data_0);
1337 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1338 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1339 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1340 input_data_0 += 16;
1341
1342 for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1343 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1344 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1345 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1346 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1347 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1348 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1349 }
1350
1351 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1352 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1353 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1354
1355 input_data_a = util_vld1q_x8(input_data_0);
1356 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1357 vst1q_s8(scratch_data_0, work_reg_a);
1358 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1359
1360 scratch_data_0 += depth_advance;
1361
1362 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1363 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1364 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1365 }
1366
1367 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1368 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1369 vst1q_s8(scratch_data_0, work_reg_a_sp);
1370 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1371
1372 scratch_data_0 += depth_advance;
1373 input_data_0 += 16;
1374 }
1375
1376 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1377 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1378 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1379 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1380 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1381 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1382 }
1383 vst1q_s8(scratch_data_0, work_reg_a);
1384 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1385
1386 scratch_data_0 += depth_advance;
1387
1388 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1389 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1390 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1391 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1392 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1393 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1394 }
1395
1396 vst1q_s8(scratch_data_0, work_reg_a_sp);
1397 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1398
1399 scratch_data_0 += depth_advance;
1400 }
1401 for (; i_depth < depth_micro_repeats; ++i_depth) {
1402 input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1403 input_data_b =
1404 vld1q_lane_s8x8(input_data_0 + 1 * input_depth, input_data_b, 0);
1405 input_data_c =
1406 vld1q_lane_s8x8(input_data_0 + 2 * input_depth, input_data_c, 0);
1407 input_data_d =
1408 vld1q_lane_s8x8(input_data_0 + 3 * input_depth, input_data_d, 0);
1409 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1410 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1411
1412 input_data_0 += 8;
1413
1414 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1415 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1416 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1417 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1418 }
1419
1420 vst1q_s8(scratch_data_0, work_reg_a);
1421 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1422
1423 scratch_data_0 += depth_advance;
1424 }
1425 scratch_data_0 += width_advance;
1426 input_data_0 += input_depth_skip;
1427 }
1428 if (width_overall_micro_repeats > input_width_micro_repeats) {
1429 TFLITE_DCHECK_EQ(width_overall_micro_repeats,
1430 input_width_micro_repeats + 1);
1431 TFLITE_DCHECK_GT(residual_width, 0);
1432 TFLITE_DCHECK_LT(residual_width, 4);
1433 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1434 input_data_c = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
1435 input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1436 input_data_d = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
1437 if (residual_width > 1) {
1438 input_data_b =
1439 vld1q_lane_s8x8(input_data_0 + input_depth, input_data_b, 0);
1440 if (residual_width == 3) {
1441 input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1442 input_data_c, 0);
1443 }
1444 }
1445 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1446 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1447
1448 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1449 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1450 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1451 }
1452 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1453
1454 vst1q_s8(scratch_data_0, work_reg_a);
1455 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1456
1457 scratch_data_0 += depth_advance;
1458 input_data_0 += 8;
1459 }
1460 scratch_data_0 += width_advance;
1461 input_data_0 += input_depth_skip;
1462 }
1463
1464 scratch_data_0 += height_advance;
1465 input_block_data += input_height_stride;
1466 }
1467 TFLITE_DCHECK_EQ(
1468 scratch_data_0,
1469 scratch_block_data + block_height * workspace_height_stride);
1470 }
1471
1472 static inline void Run(
1473 int32 height_block_number, int32 width_block_number,
1474 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1475 input_block_data,
1476 int8* scratch_block_data,
1477 const DepthwiseConvDotProdParams* function_params) {
1478 #ifdef __aarch64__
1479 PreloadInputBlock(input_block_data, function_params);
1480 #endif
1481 PackMacroBlockIntrinsics(input_block_data, scratch_block_data,
1482 function_params);
1483 }
1484 };
1485
1486 template <QuantizationType quantization_type>
1487 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1488 quantization_type,
1489 DepthwiseConvDepthMultiplication::kNoMultiplication,
1490 /*max_padding=*/1> {
1491 static inline void PackMacroBlockIntrinsics(
1492 int32 height_block_number, int32 width_block_number,
1493 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1494 input_block_data,
1495 int8* scratch_block_data,
1496 const DepthwiseConvDotProdParams* function_params) {
1497 constexpr uint8 kSignBit =
1498 QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1499
1500 const int workspace_height_stride =
1501 function_params->workspace_height_stride;
1502 const int width_overall_micro_repeats =
1503 function_params->input_width_overall_micro_repeats;
1504 const int input_width_micro_repeats =
1505 function_params->input_width_micro_repeats;
1506 const int depth_micro_repeats = function_params->depth_micro_repeats;
1507 const int block_height = function_params->inbound_block_height;
1508 const int residual_width = function_params->residual_width;
1509 const int input_height_stride = function_params->input_height_stride;
1510 const int input_depth = function_params->input_depth;
1511
1512 const int padding_left = function_params->padding_left;
1513 const int padding_right = function_params->padding_right;
1514 const int padding_top = function_params->padding_top;
1515 const int padding_bottom = function_params->padding_bottom;
1516
1517 TFLITE_DCHECK_GT(depth_micro_repeats, 0);
1518 constexpr int kSymmetricZeroPoint =
1519 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
1520
1521 const int micro_block_size = 4 * 8;
1522 const int depth_advance = width_overall_micro_repeats * micro_block_size;
1523 const int width_advance =
1524 micro_block_size *
1525 (1 - depth_micro_repeats * width_overall_micro_repeats);
1526 const int height_advance = workspace_height_stride -
1527 width_overall_micro_repeats * micro_block_size;
1528 const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
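// Descriptive note on the pointer-walk constants above: each micro block
// written to the workspace is 4 (width) x 8 (depth) = 32 bytes, and within a
// workspace row the blocks are ordered depth-major (all width micro blocks
// for depth block 0, then depth block 1, and so on). depth_advance therefore
// steps over one full set of width blocks; width_advance rewinds those depth
// steps and moves forward by one 32-byte block to the next width position;
// height_advance moves from the end of the written row to the start of the
// next workspace row; and input_depth_skip moves the source pointer from the
// end of the copied depth range to the next group of 4 width positions.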
1529
1530 const bool leading_width_padding =
1531 padding_left > 0 && width_block_number == 0;
1532 const bool trailing_width_padding =
1533 padding_right > 0 &&
1534 width_block_number == (function_params->width_macro_count - 1);
1535 const bool leading_height_padding =
1536 padding_top > 0 && height_block_number < 0;
1537 const bool trailing_height_padding =
1538 padding_bottom > 0 &&
1539 height_block_number == (function_params->height_macro_count - 1);
1540
1541 const int32 input_offset = function_params->input_offset;
1542 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
1543
1544 // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1545 // code. Note the blocks of 4x4 are still interleaved down the depth.
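// Illustration of the intended layout (assuming vzipq_s8x2_in_place zips its
// two arguments as 16-bit lanes, as its use below suggests): loads a, b, c, d
// each hold 16 depth bytes for 4 consecutive width positions. After the two
// zip stages, the 32 bytes stored per depth micro block read as
//   a0 b0 c0 d0 | a1 b1 c1 d1 | ... | a7 b7 c7 d7
// i.e. 8 groups of 4 bytes, each group being one depth channel across the 4
// width positions, which is the [depth][width] form consumed by the kernels.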
1546 int8x16_t work_reg_a;
1547 int8x16_t work_reg_b;
1548
1549 // Effect subtraction of zero-point = 128 by XOR of sign bit.
1550 const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
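// Worked example: uint8 200 = 0xC8; 0xC8 ^ 0x80 = 0x48 = 72 as int8, and
// 200 - 128 = 72, so a single XOR re-biases every lane at once.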
1551
1552 // Work through one slice, by row, at a time.
1553 int8* scratch_data_0 = scratch_block_data;
1554
1555 int copy_block_height = block_height;
1556 if (leading_height_padding) {
1557 copy_block_height -= 1;
1558 memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
1559 scratch_data_0 += workspace_height_stride;
1560 input_block_data += input_height_stride;
1561 }
1562 if (trailing_height_padding) {
1563 copy_block_height -= 1;
1564 }
1565
1566 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1567 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1568 input_data_0 = input_block_data;
1569 int8x16_t input_data_a;
1570 int8x16_t input_data_b;
1571 int8x16_t input_data_c;
1572 int8x16_t input_data_d;
1573
1574 // Traverse the width one point at a time, but the depth in (micro) blocks
1575 // of size 8.
1576 //
1577 // The depth and width margins, which are filled with "zeros", may be
1578 // larger than is strictly needed to calculate output. This is because the
1579 // conv calculation is performed across complete micro blocks.
1580 for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
1581 // Figure out division of work (available input vs zero-ed).
1582 int adjusted_residual_width =
1583 j_width == (input_width_micro_repeats) ? residual_width : 4;
1584
1585 if (trailing_width_padding &&
1586 j_width == (width_overall_micro_repeats - 1)) {
1587 adjusted_residual_width -= 1;
1588 }
1589 int start_width = 0;
1590 if (leading_width_padding && j_width == 0) {
1591 start_width = 1;
1592 }
1593 if (start_width == 0) {
1594 if (adjusted_residual_width == 4) {
1595 int8x16_t work_reg_a_sp;
1596 int8x16_t work_reg_b_sp;
1597
1598 int i_depth = 0;
1599
1600 if (depth_micro_repeats >= 2) {
1601 i_depth += 2;
1602
1603 input_data_a = util_vld1q_x8(input_data_0);
1604 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1605 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1606 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1607 input_data_0 += 16;
1608
1609 for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1610 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1611 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1612 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1613 if (quantization_type ==
1614 QuantizationType::kNonPerChannelUint8) {
1615 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1616 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1617 }
1618
1619 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1620 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1621 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1622
1623 input_data_a = util_vld1q_x8(input_data_0);
1624 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1625 vst1q_s8(scratch_data_0, work_reg_a);
1626 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1627
1628 scratch_data_0 += depth_advance;
1629
1630 if (quantization_type ==
1631 QuantizationType::kNonPerChannelUint8) {
1632 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1633 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1634 }
1635
1636 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1637 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1638 vst1q_s8(scratch_data_0, work_reg_a_sp);
1639 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1640
1641 scratch_data_0 += depth_advance;
1642 input_data_0 += 16;
1643 }
1644
1645 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1646 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1647 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1648 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1649 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1650 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1651 }
1652 vst1q_s8(scratch_data_0, work_reg_a);
1653 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1654
1655 scratch_data_0 += depth_advance;
1656
1657 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1658 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1659 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1660 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1661 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1662 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1663 }
1664
1665 vst1q_s8(scratch_data_0, work_reg_a_sp);
1666 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1667
1668 scratch_data_0 += depth_advance;
1669 }
1670 for (; i_depth < depth_micro_repeats; ++i_depth) {
1671 input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1672 input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
1673 input_data_b, 0);
1674 input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1675 input_data_c, 0);
1676 input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
1677 input_data_d, 0);
1678 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1679 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1680
1681 input_data_0 += 8;
1682
1683 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1684 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1685 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1686 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1687 }
1688
1689 vst1q_s8(scratch_data_0, work_reg_a);
1690 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1691
1692 scratch_data_0 += depth_advance;
1693 }
1694 scratch_data_0 += width_advance;
1695 input_data_0 += input_depth_skip;
1696 } else {
1697 TFLITE_DCHECK_LT(adjusted_residual_width, 4);
1698 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1699 input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1700 input_data_b = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1701 input_data_c = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1702 input_data_d = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1703 if (adjusted_residual_width > 0) {
1704 input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1705 if (adjusted_residual_width > 1) {
1706 input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
1707 input_data_b, 0);
1708 if (adjusted_residual_width == 3) {
1709 input_data_c = vld1q_lane_s8x8(
1710 input_data_0 + 2 * input_depth, input_data_c, 0);
1711 }
1712 }
1713 }
1714 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1715 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1716
1717 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1718 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1719 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1720 }
1721 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1722
1723 vst1q_s8(scratch_data_0, work_reg_a);
1724 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1725
1726 scratch_data_0 += depth_advance;
1727 input_data_0 += 8;
1728 }
1729 scratch_data_0 += width_advance;
1730 input_data_0 += input_depth_skip;
1731 }
1732 } else {
1733 if (adjusted_residual_width == 4) {
1734 int8x16_t work_reg_a_sp;
1735 int8x16_t work_reg_b_sp;
1736
1737 int i_depth = 0;
1738
1739 if (depth_micro_repeats >= 2) {
1740 i_depth += 2;
1741
1742 input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1743 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1744 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1745 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1746 input_data_0 += 16;
1747
1748 for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1749 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1750 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1751 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1752 if (quantization_type ==
1753 QuantizationType::kNonPerChannelUint8) {
1754 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1755 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1756 }
1757
1758 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1759 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1760 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1761
1762 input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1763 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1764 vst1q_s8(scratch_data_0, work_reg_a);
1765 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1766
1767 scratch_data_0 += depth_advance;
1768
1769 if (quantization_type ==
1770 QuantizationType::kNonPerChannelUint8) {
1771 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1772 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1773 }
1774
1775 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1776 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1777 vst1q_s8(scratch_data_0, work_reg_a_sp);
1778 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1779
1780 scratch_data_0 += depth_advance;
1781 input_data_0 += 16;
1782 }
1783
1784 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1785 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1786 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1787 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1788 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1789 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1790 }
1791 vst1q_s8(scratch_data_0, work_reg_a);
1792 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1793
1794 scratch_data_0 += depth_advance;
1795
1796 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1797 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1798 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1799 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1800 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1801 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1802 }
1803
1804 vst1q_s8(scratch_data_0, work_reg_a_sp);
1805 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1806
1807 scratch_data_0 += depth_advance;
1808 }
1809 for (; i_depth < depth_micro_repeats; ++i_depth) {
1810 input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1811 input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
1812 input_data_b, 0);
1813 input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1814 input_data_c, 0);
1815 input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
1816 input_data_d, 0);
1817 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1818 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1819
1820 input_data_0 += 8;
1821
1822 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1823 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1824 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1825 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1826 }
1827
1828 vst1q_s8(scratch_data_0, work_reg_a);
1829 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1830
1831 scratch_data_0 += depth_advance;
1832 }
1833 scratch_data_0 += width_advance;
1834 input_data_0 += input_depth_skip;
1835 } else {
1836 TFLITE_DCHECK_LT(adjusted_residual_width, 4);
1837
1838 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1839 input_data_a = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1840 input_data_b = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1841 input_data_c = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1842 input_data_d = vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
1843 // Skip loading first column.
1844 if (adjusted_residual_width > 1) {
1845 input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
1846 input_data_b, 0);
1847 if (adjusted_residual_width == 3) {
1848 input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1849 input_data_c, 0);
1850 }
1851 }
1852 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1853 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1854
1855 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1856 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1857 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1858 }
1859 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1860
1861 vst1q_s8(scratch_data_0, work_reg_a);
1862 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1863
1864 scratch_data_0 += depth_advance;
1865 input_data_0 += 8;
1866 }
1867 scratch_data_0 += width_advance;
1868 input_data_0 += input_depth_skip;
1869 }
1870 }
1871 }
1872 scratch_data_0 += height_advance;
1873 input_block_data += input_height_stride;
1874 }
1875
1876 if (trailing_height_padding) {
1877 memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
1878 scratch_data_0 += workspace_height_stride;
1879 }
1880
1881 TFLITE_DCHECK_EQ(
1882 scratch_data_0,
1883 scratch_block_data + block_height * workspace_height_stride);
1884 }
1885
1886 static inline void Run(
1887 int32 height_block_number, int32 width_block_number,
1888 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1889 input_block_data,
1890 int8* scratch_block_data,
1891 const DepthwiseConvDotProdParams* function_params) {
1892 #ifdef __aarch64__
1893 PreloadInputBlock(input_block_data, function_params);
1894 #endif
1895
1896 PackMacroBlockIntrinsics(height_block_number, width_block_number,
1897 input_block_data, scratch_block_data,
1898 function_params);
1899 }
1900 };
1901
1902 template <QuantizationType quantization_type>
1903 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1904 quantization_type,
1905 DepthwiseConvDepthMultiplication::kUnitInputDepth,
1906 /*max_padding=*/1> {
1907 static inline void PackMacroBlockIntrinsics(
1908 int32 height_block_number, int32 width_block_number,
1909 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1910 input_block_data,
1911 int8* scratch_block_data,
1912 const DepthwiseConvDotProdParams* function_params) {
1913 const int workspace_height_stride =
1914 function_params->workspace_height_stride;
1915 const int width_overall_micro_repeats =
1916 function_params->input_width_overall_micro_repeats;
1917 const int input_width_micro_repeats =
1918 function_params->input_width_micro_repeats;
1919 const int block_height = function_params->inbound_block_height;
1920 const int residual_width = function_params->residual_width;
1921 const int input_height_stride = function_params->input_height_stride;
1922
1923 const int padding_left = function_params->padding_left;
1924 const int padding_right = function_params->padding_right;
1925 const int padding_top = function_params->padding_top;
1926 const int padding_bottom = function_params->padding_bottom;
1927
1928 constexpr int kSymmetricZeroPoint =
1929 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
1930
1931 TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
1932
1933 const bool leading_width_padding =
1934 padding_left > 0 && width_block_number == 0;
1935 const bool trailing_width_padding =
1936 padding_right > 0 &&
1937 width_block_number == (function_params->width_macro_count - 1);
1938 const bool leading_height_padding =
1939 padding_top > 0 && height_block_number < 0;
1940 const bool trailing_height_padding =
1941 padding_bottom > 0 &&
1942 height_block_number == (function_params->height_macro_count - 1);
1943
1944 const int32 input_offset = function_params->input_offset;
1945 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
1946
1947 // Work through one slice, by row, at a time.
1948 int8* scratch_data_base = scratch_block_data;
1949
1950 int copy_block_height = block_height;
1951 if (leading_height_padding) {
1952 copy_block_height -= 1;
1953 memset(scratch_data_base, -input_offset_difference,
1954 workspace_height_stride + kWorkspaceExtension);
1955 scratch_data_base += workspace_height_stride;
1956 input_block_data += input_height_stride;
1957 }
1958 if (trailing_height_padding) {
1959 copy_block_height -= 1;
1960 }
1961
1962 int adjusted_residual_width =
1963 input_width_micro_repeats < width_overall_micro_repeats ? residual_width
1964 : 4;
1965
1966 if (trailing_width_padding) {
1967 adjusted_residual_width -= 1;
1968 }
1969 int start_width = 0;
1970 if (leading_width_padding) {
1971 start_width = 1;
1972 input_block_data += 1;
1973 }
1974
1975 const int copy_size = (width_overall_micro_repeats - 1) * 4 +
1976 adjusted_residual_width - start_width;
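// That is, copy_size counts the real input bytes copied into each workspace
// row: 4 per full width micro block, adjusted_residual_width for the final
// (possibly partial) block, less the start_width column supplied by leading
// padding rather than by a copy.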
1977 // Adjusted so that later conditionals are simplified.
1978 const int copy_size_adjusted =
1979 trailing_width_padding ? copy_size + 1 : copy_size;
1980
1981 TFLITE_DCHECK_LE(
1982 copy_size,
1983 input_height_stride - width_block_number * input_width_micro_repeats);
1984 // We may drop up to stride-1 of trailing input.
1985 TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
1986
1987 int scratch_data_offset = 0;
1988 int input_block_offset = 0;
1989
1990 constexpr uint8 kSignBit =
1991 QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1992
1993 // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1994 // code. Note the blocks of 4x4 are still interleaved down the depth.
1995 int8x16_t work_reg;
1996 int8x8_t half_work_reg;
1997 int8x8_t padding_mask;
1998
1999 // Effect subtraction of zero-point = 128 by XOR of sign bit.
2000 const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
2001 const int8x16_t padding_reg =
2002 vreinterpretq_s8_u8(vdupq_n_u8(-input_offset));
2003 padding_mask = vdup_n_s8(-1);
2004 half_work_reg = vdup_n_s8(0);
2005
2006 if (copy_size >= 16) {
2007 const int copy_remaining = (copy_size + start_width) & 0x7;
2008 padding_mask = vreinterpret_s8_s64(vshl_s64(
2009 vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
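// Shifting the all-ones mask left by copy_remaining bytes clears the low
// copy_remaining lanes and keeps 0xFF in the rest. For example, with
// copy_remaining == 3 the mask is {0, 0, 0, FF, FF, FF, FF, FF}, so the
// vbsl_s8 in the residual path below keeps the valid low lanes and fills the
// upper lanes with the padding value.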
2010
2011 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2012 // Work through one slice, by row, at a time.
2013 int8* scratch_data = scratch_data_base + scratch_data_offset;
2014
2015 int copy_done = 0;
2016
2017 // The surrounding condition ensures that we always need at least one
2018 // iteration of the main copy loop. In the case of leading width
2019 // padding, we unroll this specially.
2020 if (leading_width_padding) {
2021 work_reg = util_vld1q_x8(input_block_data + input_block_offset);
2022 work_reg = vextq_s8(padding_reg, work_reg, 15);
2023 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2024 work_reg = veorq_s8(work_reg, sign_bit);
2025 }
2026 vst1q_s8(scratch_data, work_reg);
2027 copy_done += 15;
2028 }
2029
2030 // Main copy loop.
2031 for (; (copy_done + 16) <= copy_size; copy_done += 16) {
2032 work_reg =
2033 util_vld1q_x8(input_block_data + input_block_offset + copy_done);
2034 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2035 work_reg = veorq_s8(work_reg, sign_bit);
2036 }
2037 TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
2038 vst1q_s8(scratch_data + start_width + copy_done, work_reg);
2039 }
2040
2041 if (copy_done + 8 <= copy_size) {
2042 half_work_reg =
2043 util_vld1_x8(input_block_data + input_block_offset + copy_done);
2044 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2045 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2046 }
2047 TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
2048 vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2049 copy_done += 8;
2050 }
2051
2052 TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2053 // Total amount
2054 // = copy_size - copy_done + 4 - adjusted_residual_width
2055 // = width_overall_micro_repeats * 4 - start_width - copy_done.
2056 // Undone micro blocks
2057 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2058
2059 // Conditional is (copy_remaining > 0 || trailing_width_padding).
2060 if (copy_done < copy_size_adjusted) {
2061 // Employ overlapping-load strategy in order to load full register,
2062 // but use only part.
2063 // This has the advantage of resulting in zeros after shifting.
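// For example, with copy_remaining == 3 the 8-byte load below ends exactly
// at copy_size, the shift right by 5 byte lanes moves the 3 valid trailing
// bytes down into lanes 0..2, and the padding mask then overwrites lanes
// 3..7.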
2064 half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
2065 copy_size - 8);
2066
2067 half_work_reg = vreinterpret_s8_s64(
2068 vshl_s64(vreinterpret_s64_s8(half_work_reg),
2069 vdup_n_s64(-8 * (8 - copy_remaining))));
2070 half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2071 vget_low_s8(padding_reg), half_work_reg);
2072
2073 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2074 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2075 }
2076 TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
2077 vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2078 }
2079
2080 // Trailing guard.
2081 vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2082 vst1_s8(scratch_data + start_width + copy_done + 8, half_work_reg);
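// The guard stores above intentionally run past the copied row data: the
// kernels consume the workspace in whole micro blocks, so writing extra
// (don't-care but initialized) bytes here keeps those reads inside memory
// the packing stage has touched (compare the kWorkspaceExtension used when
// whole padded rows are memset above).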
2083
2084 scratch_data_offset += workspace_height_stride;
2085 input_block_offset += input_height_stride;
2086 }
2087 } else if (copy_size >= 4) {
2088 const int copy_remaining = (copy_size + start_width) & 0x3;
2089 padding_mask = vreinterpret_s8_s64(vshl_s64(
2090 vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
2091
2092 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2093 // Work through one slice, by row, at a time.
2094 int8* scratch_data = scratch_data_base + scratch_data_offset;
2095
2096 int copy_done = 0;
2097
2098 // The surrounding condition ensures that we always need at least one
2099 // iteration of the main copy loop. In the case of leading width
2100 // padding, we unroll this specially.
2101 if (leading_width_padding) {
2102 half_work_reg = vld1_lane_8x4(input_block_data + input_block_offset,
2103 half_work_reg, 0);
2104 half_work_reg = vext_s8(vget_low_s8(padding_reg), half_work_reg, 7);
2105 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2106 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2107 }
2108 vst1_lane_s8x4(scratch_data, half_work_reg, 0);
2109 copy_done += 3;
2110 }
2111
2112 // Main copy loop.
2113 for (; (copy_done + 4) <= copy_size; copy_done += 4) {
2114 half_work_reg =
2115 vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
2116 half_work_reg, 0);
2117 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2118 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2119 }
2120 TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
2121 vst1_lane_s8x4(scratch_data + start_width + copy_done, half_work_reg,
2122 0);
2123 }
2124
2125 TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2126 // Total amount
2127 // = copy_size - copy_done + 4 - adjusted_residual_width
2128 // = width_overall_micro_repeats * 4 - start_width - copy_done.
2129 // Undone micro blocks
2130 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2131
2132 // Conditional is (copy_remaining > 0 || trailing_width_padding).
2133 if (copy_done < copy_size_adjusted) {
2134 TFLITE_DCHECK_LT(copy_remaining, 4);
2135 // Employ overlapping-load strategy in order to load full register,
2136 // but use only part.
2137 // This has the advantage of resulting in zeros after shifting.
2138 half_work_reg = vld1_lane_8x4(
2139 input_block_data + input_block_offset + copy_size - 4,
2140 half_work_reg, 0);
2141
2142 half_work_reg = vreinterpret_s8_s64(
2143 vshl_s64(vreinterpret_s64_s8(half_work_reg),
2144 vdup_n_s64(-8 * (4 - copy_remaining))));
2145 half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2146 vget_low_s8(padding_reg), half_work_reg);
2147
2148 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2149 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2150 }
2151 TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
2152 vst1_lane_s8x4(scratch_data + start_width + copy_done, half_work_reg,
2153 0);
2154 copy_done += 4;
2155 }
2156 // Trailing guard.
2157 vst1_lane_s8x4(scratch_data + start_width + copy_done, half_work_reg,
2158 0);
2159 vst1_lane_s8x4(scratch_data + start_width + copy_done + 4,
2160 half_work_reg, 0);
2161 vst1_lane_s8x4(scratch_data + start_width + copy_done + 8,
2162 half_work_reg, 0);
2163 vst1_lane_s8x4(scratch_data + start_width + copy_done + 12,
2164 half_work_reg, 0);
2165
2166 scratch_data_offset += workspace_height_stride;
2167 input_block_offset += input_height_stride;
2168 }
2169 } else if (width_overall_micro_repeats == 2) {
2170 // Special case of 1 + 3 + 1, padding + copy + padding.
2171 // This is rarely executed in practice.
2172 TFLITE_DCHECK_EQ(copy_size, 3);
2173 TFLITE_DCHECK_EQ(start_width, 1);
2174 TFLITE_DCHECK(leading_width_padding);
2175 TFLITE_DCHECK(trailing_width_padding);
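// Each workspace row in this case is {pad, in[0], in[1], in[2], pad, ...}:
// the three input bytes are loaded one lane at a time into lanes 1..3 while
// lane 0 keeps the padding value; the guard stores below then re-store the
// low lanes, which keeps the over-allocated tail of the row initialized (the
// trailing padded column still receives the padding value from lane 0).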
2176
2177 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2178 half_work_reg = vreinterpret_s8_u8(vdup_n_u8(-input_offset));
2179 half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
2180 input_block_data + input_block_offset),
2181 half_work_reg, 1);
2182 half_work_reg =
2183 vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
2184 input_block_offset + 1),
2185 half_work_reg, 2);
2186 half_work_reg =
2187 vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
2188 input_block_offset + 2),
2189 half_work_reg, 3);
2190
2191 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2192 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2193 }
2194 TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
2195 vst1_s8(scratch_data_base + scratch_data_offset, half_work_reg);
2196
2197 // Trailing guard.
2198 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 4,
2199 half_work_reg, 0);
2200 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 8,
2201 half_work_reg, 0);
2202 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 12,
2203 half_work_reg, 0);
2204 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 16,
2205 half_work_reg, 0);
2206
2207 scratch_data_offset += workspace_height_stride;
2208 input_block_offset += input_height_stride;
2209 }
2210 } else {
2211 TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
2212 const int copy_remaining = (copy_size + start_width) & 0x3;
2213 padding_mask = vreinterpret_s8_s64(vshl_s64(
2214 vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
2215 if (leading_width_padding) {
2216 padding_mask = vreinterpret_s8_u8(
2217 vset_lane_u8(255, vreinterpret_u8_s8(padding_mask), 0));
2218 }
2219
2220 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2221 for (int i = 0; i < copy_size; ++i) {
2222 half_work_reg = vreinterpret_s8_s64(
2223 vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2224 half_work_reg = vld1_lane_s8(
2225 reinterpret_cast<const int8*>(
2226 input_block_data + input_block_offset + copy_size - 1 - i),
2227 half_work_reg, 0);
2228 }
2229 if (leading_width_padding) {
2230 half_work_reg = vreinterpret_s8_s64(
2231 vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2232 }
2233 half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2234 vget_low_s8(padding_reg), half_work_reg);
2235
2236 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2237 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2238 }
2239 TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
2240 vst1_lane_s8x4(scratch_data_base + scratch_data_offset, half_work_reg,
2241 0);
2242
2243 // Trailing guard.
2244 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 4,
2245 half_work_reg, 0);
2246 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 8,
2247 half_work_reg, 0);
2248 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 12,
2249 half_work_reg, 0);
2250 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 16,
2251 half_work_reg, 0);
2252
2253 scratch_data_offset += workspace_height_stride;
2254 input_block_offset += input_height_stride;
2255 }
2256 }
2257
2258 scratch_data_base += copy_block_height * workspace_height_stride;
2259
2260 if (trailing_height_padding) {
2261 memset(scratch_data_base, -input_offset_difference,
2262 workspace_height_stride + kWorkspaceExtension);
2263 scratch_data_base += workspace_height_stride;
2264 }
2265
2266 TFLITE_DCHECK_EQ(
2267 scratch_data_base,
2268 scratch_block_data + block_height * workspace_height_stride);
2269 }
2270
2271 static inline void Run(
2272 int32 height_block_number, int32 width_block_number,
2273 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2274 input_block_data,
2275 int8* scratch_block_data,
2276 const DepthwiseConvDotProdParams* function_params) {
2277 #ifdef __aarch64__
2278 PreloadInputBlock(input_block_data, function_params);
2279 #endif
2280
2281 PackMacroBlockIntrinsics(height_block_number, width_block_number,
2282 input_block_data, scratch_block_data,
2283 function_params);
2284 }
2285 };
2286
2287 template <QuantizationType quantization_type>
2288 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
2289 quantization_type,
2290 DepthwiseConvDepthMultiplication::kUnitInputDepth,
2291 /*max_padding=*/0> {
2292 static inline void PackMacroBlockIntrinsics(
2293 int32 height_block_number, int32 width_block_number,
2294 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2295 input_block_data,
2296 int8* scratch_block_data,
2297 const DepthwiseConvDotProdParams* function_params) {
2298 const int workspace_height_stride =
2299 function_params->workspace_height_stride;
2300 const int width_overall_micro_repeats =
2301 function_params->input_width_overall_micro_repeats;
2302 const int input_width_micro_repeats =
2303 function_params->input_width_micro_repeats;
2304 const int block_height = function_params->inbound_block_height;
2305 const int residual_width = function_params->residual_width;
2306 const int input_height_stride = function_params->input_height_stride;
2307
2308 TFLITE_DCHECK_EQ(function_params->padding_left, 0);
2309 TFLITE_DCHECK_EQ(function_params->padding_right, 0);
2310 TFLITE_DCHECK_EQ(function_params->padding_top, 0);
2311 TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
2312
2313 TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
2314
2315 // Work through one slice, by row, at a time.
2316 int8* scratch_data_base = scratch_block_data;
2317
2318 const int copy_block_height = block_height;
2319
2320 int adjusted_residual_width =
2321 input_width_micro_repeats < width_overall_micro_repeats ? residual_width
2322 : 4;
2323
2324 const int copy_size =
2325 (width_overall_micro_repeats - 1) * 4 + adjusted_residual_width;
2326
2327 TFLITE_DCHECK_LE(
2328 copy_size,
2329 input_height_stride - width_block_number * input_width_micro_repeats);
2330 // We may drop up to stride-1 of trailing input.
2331 TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
2332
2333 int scratch_data_offset = 0;
2334 int input_block_offset = 0;
2335
2336 constexpr uint8 kSignBit =
2337 QuantizationTypeImpl<quantization_type>::kUint8SignBit;
2338
2339 // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
2340 // code. Note the blocks of 4x4 are still interleaved down the depth.
2341 int8x16_t work_reg;
2342 int8x8_t half_work_reg;
2343
2344 // Effect subtraction of zero-point = 128 by XOR of sign bit.
2345 const int8x16_t sign_bit = vreinterpretq_s8_u8(vdupq_n_u8(kSignBit));
2346 half_work_reg = vdup_n_s8(0);
2347
2348 if (copy_size >= 16) {
2349 const int copy_remaining = copy_size & 0x7;
2350
2351 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2352 // Work through one slice, by row, at a time.
2353 int8* scratch_data = scratch_data_base + scratch_data_offset;
2354
2355 int copy_done = 0;
2356
2357 // Main copy loop.
2358 for (; (copy_done + 16) <= copy_size; copy_done += 16) {
2359 work_reg =
2360 util_vld1q_x8(input_block_data + input_block_offset + copy_done);
2361 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2362 work_reg = veorq_s8(work_reg, sign_bit);
2363 }
2364 TFLITE_DCHECK_EQ(copy_done % 16, 0);
2365 vst1q_s8(scratch_data + copy_done, work_reg);
2366 }
2367
2368 if (copy_done + 8 <= copy_size) {
2369 half_work_reg =
2370 util_vld1_x8(input_block_data + input_block_offset + copy_done);
2371 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2372 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2373 }
2374 TFLITE_DCHECK_EQ(copy_done % 8, 0);
2375 vst1_s8(scratch_data + copy_done, half_work_reg);
2376 copy_done += 8;
2377 }
2378
2379 TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2380 // Total amount
2381 // = copy_size - copy_done + 4 - adjusted_residual_width
2382 // = width_overall_micro_repeats * 4 - start_width - copy_done.
2383 // Undone micro blocks
2384 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2385
2386 // Conditional is (copy_remaining > 0 || trailing_width_padding).
2387 if (copy_done < copy_size) {
2388 // Employ overlapping-load strategy in order to load full register,
2389 // but use only part.
2390 // This has the advantage of resulting in zeros after shifting.
2391 half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
2392 copy_size - 8);
2393
2394 half_work_reg = vreinterpret_s8_s64(
2395 vshl_s64(vreinterpret_s64_s8(half_work_reg),
2396 vdup_n_s64(-8 * (8 - copy_remaining))));
2397
2398 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2399 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2400 }
2401 TFLITE_DCHECK_EQ(copy_done % 8, 0);
2402 vst1_s8(scratch_data + copy_done, half_work_reg);
2403 copy_done += 8;
2404 }
2405
2406 // Trailing guard.
2407 vst1_s8(scratch_data + copy_done, half_work_reg);
2408 vst1_s8(scratch_data + copy_done + 8, half_work_reg);
2409
2410 scratch_data_offset += workspace_height_stride;
2411 input_block_offset += input_height_stride;
2412 }
2413 } else if (copy_size >= 4) {
2414 const int copy_remaining = copy_size & 0x3;
2415
2416 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2417 // Work through one slice, by row, at a time.
2418 int8* scratch_data = scratch_data_base + scratch_data_offset;
2419
2420 int copy_done = 0;
2421
2422 // Main copy loop.
2423 for (; (copy_done + 4) <= copy_size; copy_done += 4) {
2424 half_work_reg =
2425 vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
2426 half_work_reg, 0);
2427 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2428 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2429 }
2430 TFLITE_DCHECK_EQ(copy_done % 4, 0);
2431 vst1_lane_s8x4(scratch_data + copy_done, half_work_reg, 0);
2432 }
2433
2434 TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2435 // Total amount
2436 // = copy_size - copy_done + 4 - adjusted_residual_width
2437 // = width_overall_micro_repeats * 4 - start_width - copy_done.
2438 // Undone micro blocks
2439 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2440
2441 // Conditional is (copy_remaining > 0 || trailing_width_padding).
2442 if (copy_done < copy_size) {
2443 TFLITE_DCHECK_LT(copy_remaining, 4);
2444 // Employ overlapping-load strategy in order to load full register,
2445 // but use only part.
2446 // This has the advantage of resulting in zeros after shifting.
2447 half_work_reg = vld1_lane_8x4(
2448 input_block_data + input_block_offset + copy_size - 4,
2449 half_work_reg, 0);
2450
2451 half_work_reg = vreinterpret_s8_s64(
2452 vshl_s64(vreinterpret_s64_s8(half_work_reg),
2453 vdup_n_s64(-8 * (4 - copy_remaining))));
2454
2455 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2456 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2457 }
2458 TFLITE_DCHECK_EQ(copy_done % 4, 0);
2459 vst1_lane_s8x4(scratch_data + copy_done, half_work_reg, 0);
2460 copy_done += 4;
2461 }
2462 // Trailing guard.
2463 vst1_lane_s8x4(scratch_data + copy_done, half_work_reg, 0);
2464 vst1_lane_s8x4(scratch_data + copy_done + 4, half_work_reg, 0);
2465 vst1_lane_s8x4(scratch_data + copy_done + 8, half_work_reg, 0);
2466 vst1_lane_s8x4(scratch_data + copy_done + 12, half_work_reg, 0);
2467
2468 scratch_data_offset += workspace_height_stride;
2469 input_block_offset += input_height_stride;
2470 }
2471 } else {
2472 TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
2473
2474 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2475 for (int i = 0; i < copy_size; ++i) {
2476 half_work_reg = vreinterpret_s8_s64(
2477 vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2478 half_work_reg = vld1_lane_s8(
2479 reinterpret_cast<const int8*>(
2480 input_block_data + input_block_offset + copy_size - 1 - i),
2481 half_work_reg, 0);
2482 }
2483
2484 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2485 TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
2486 vst1_lane_s8x4(scratch_data_base + scratch_data_offset, half_work_reg,
2487 0);
2488
2489 // Trailing guard.
2490 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 4,
2491 half_work_reg, 0);
2492 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 8,
2493 half_work_reg, 0);
2494 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 12,
2495 half_work_reg, 0);
2496 vst1_lane_s8x4(scratch_data_base + scratch_data_offset + 16,
2497 half_work_reg, 0);
2498
2499 scratch_data_offset += workspace_height_stride;
2500 input_block_offset += input_height_stride;
2501 }
2502 }
2503
2504 scratch_data_base += copy_block_height * workspace_height_stride;
2505
2506 TFLITE_DCHECK_EQ(
2507 scratch_data_base,
2508 scratch_block_data + block_height * workspace_height_stride);
2509 }
2510
2511 static inline void Run(
2512 int32 height_block_number, int32 width_block_number,
2513 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2514 input_block_data,
2515 int8* scratch_block_data,
2516 const DepthwiseConvDotProdParams* function_params) {
2517 #ifdef __aarch64__
2518 PreloadInputBlock(input_block_data, function_params);
2519 #endif
2520
2521 PackMacroBlockIntrinsics(height_block_number, width_block_number,
2522 input_block_data, scratch_block_data,
2523 function_params);
2524 }
2525 };
2526
2527 #endif // ARM NEON
2528
2529 // Apply filter to macro block of input data and store results.
2530 //
2531 // Requirement: depth_micro_repeats > 0 || residual_depth > 0.
2532 template <int32 stride, QuantizationType quantization_type>
2533 struct KernelMacroBlock<
2534 DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
2535 DepthwiseConvDepthMultiplication::kNoMultiplication, stride> {
2536 // Construct a width-shifted combination of two input sub-blocks, effectively
2537 // concatenating them.
2538 //
2539 // The filter is applied using sub-blocks. These are in the needed form for
2540 // the first (width) offset. For subsequent offsets, the filter is applied to
2541 // shifted and combined data. The concatenation and shifting herein are fairly
2542 // straightforward, but in the optimized code this is an area of creative
2543 // design, because NEON instructions do not directly support the required
2544 // between-register permutation.
2545 //
2546 // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
2547 // move along the width for each output point calculation, data is shifted, in
2548 // essence, between two such blocks.
2549 //
2550 // selected_data has format height 3, depth 4, width 4.
2551 //
2552 // When the micro block is trailing (the last across the macro-block width),
2553 // it would be illegal to load the right (next) block, and the no_right_block
2554 // indicates this scenario.
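// Concrete illustration: with offset == 1, each selected_data[h][d] row
// becomes {left[h][d][1], left[h][d][2], left[h][d][3], right[h][d][0]},
// i.e. the 4-wide window slides one input position to the right, borrowing
// its last element from the neighboring micro block (or from zeros when
// no_right_block is set).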
2555 static inline void ConcatenateInputSubBlocks(int offset, int sub_block,
2556 int workspace_height_stride,
2557 int width_micro_stride,
2558 bool no_right_block,
2559 const int8* input_block,
2560 int8 selected_data[3][4][4]) {
2561 TFLITE_DCHECK_GE(offset, 0);
2562 TFLITE_DCHECK_LT(offset, 4);
2563
2564 // The input banks have same format as selected_data.
2565 int8 left_bank[3][4][4];
2566 int8 right_bank[3][4][4];
2567
2568 // Work through one slice, by row, at a time.
2569 for (int k_height = 0; k_height < 3; ++k_height) {
2570 // Simulate demangling of mangled storage arrangement.
2571 const int8* left_input_block =
2572 &input_block[k_height * workspace_height_stride + sub_block * 2 * 8];
2573 memcpy(left_bank[k_height][0], left_input_block, 16);
2574 if (no_right_block) {
2575 memset(right_bank[k_height][0], 0, 16);
2576 } else {
2577 const int8* right_input_block =
2578 &input_block[k_height * workspace_height_stride +
2579 sub_block * 2 * 8 + width_micro_stride];
2580 memcpy(right_bank[k_height][0], right_input_block, 16);
2581 }
2582 for (int depth_index = 0; depth_index < 4; ++depth_index) {
2583 memcpy(selected_data[k_height][depth_index],
2584 &left_bank[k_height][depth_index][offset], 4 - offset);
2585 memcpy(&selected_data[k_height][depth_index][4 - offset],
2586 right_bank[k_height][depth_index], offset);
2587 }
2588 }
2589 }
2590
2591 // Straight implementation of 3x3 filter within sub-micro block.
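// In outline, for each of the 4 depth channels d:
//   acc = sum over y in [0, 3), x in [0, 4) of filter * input, plus bias[d]
//   acc = DepthwiseConvRound<kUpward>(acc, output_multiplier, output_shift)
//   out = clamp(acc + output_offset, activation_min, activation_max)
// mirroring the requantization sequence of the reference implementation.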
2592 static inline void Calculate3x3FilterOutput(
2593 const DepthwiseConvDotProdParams& params, int sub_block,
2594 const int8 selected_data[3][4][4], const int8 filter_bank[3][2][4][4],
2595 const int32* bias_data, uint8 output_values[4]) {
2596 const int32 output_activation_min = params.quantized_activation_min;
2597 const int32 output_activation_max = params.quantized_activation_max;
2598 const int32 output_multiplier = params.output_multiplier;
2599 const int32 output_shift = params.output_shift;
2600 const int32 output_offset = params.output_offset;
2601 for (int d = 0; d < 4; ++d) {
2602 int32 acc = 0;
2603 for (int y = 0; y < 3; ++y) {
2604 for (int x = 0; x < 4; ++x) {
2605 int32 input_val = selected_data[y][d][x];
2606 int32 filter_val = filter_bank[y][sub_block][d][x];
2607 acc += filter_val * input_val;
2608 }
2609 }
2610 acc += bias_data[d];
2611 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
2612 DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
2613 output_shift);
2614 acc += output_offset;
2615 acc = std::max(acc, output_activation_min);
2616 acc = std::min(acc, output_activation_max);
2617 output_values[d] = static_cast<uint8>(acc);
2618 }
2619 }
2620
2621 static inline void Run(const int8* scratch_block_data,
2622 const int8* filter_workspace, const int32* bias_data,
2623 uint8* output_block_data,
2624 const DepthwiseConvDotProdParams* function_params) {
2625 const int workspace_height_stride =
2626 function_params->workspace_height_stride;
2627 const int input_width_overall_micro_repeats =
2628 function_params->input_width_overall_micro_repeats;
2629 const int output_width_micro_repeats =
2630 function_params->output_width_micro_repeats;
2631 const int depth_micro_repeats = function_params->depth_micro_repeats;
2632 const int depth = function_params->input_depth;
2633 const int stride_val = function_params->stride;
2634 const int four_over_stride = function_params->four_over_stride;
2635
2636 const int output_width_overall_micro_repeats =
2637 function_params->output_width_overall_micro_repeats;
2638 const int block_height = function_params->outbound_block_height;
2639 const int residual_width = function_params->output_residual_width;
2640 const int output_height_stride = function_params->output_height_stride;
2641 constexpr int bias_increment = 4;
2642 TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
2643
2644 TFLITE_DCHECK(depth_micro_repeats > 0);
2645 const int width_micro_stride = 4 * 8;
2646 const int depth_micro_stride =
2647 width_micro_stride * input_width_overall_micro_repeats;
2648
2649 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
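// 2 sub-blocks * 3 filter rows * 4 depth channels * 4 width taps = 96 bytes
// of shuffled filter data per depth micro block, matching the filter_bank
// shape declared below.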
2650
2651 // Simulate NEON-register transposition of subset of filter.
2652 int8 filter_bank[3][2][4][4]; // Height 3, sub-block, depth 4, width 4.
2653 // Simulate NEON-register input data concatenation + sub-selection.
2654 int8 sub_selected_input_data[3][4][4]; // Height 3, depth 4, width 4.
2655 uint8 output_values[4]; // Depth 4.
2656
2657 // The outer 3 loops go through all the micro blocks in a macro block, and
2658 // separately treat the two sub-blocks within each micro block.
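// Loop order, outermost to innermost: depth micro block (j_depth), filter
// sub-block (s), output row (k_height), width micro block (i_width), width
// offset within the micro block (x).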
2659 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2660 memcpy(filter_bank[0][0][0],
2661 filter_workspace + j_depth * shuffled_filter_increment,
2662 shuffled_filter_increment);
2663
2664 for (int s = 0; s < 2; ++s) {
2665 for (int k_height = 0; k_height < block_height; ++k_height) {
2666 const int8* scratch_data =
2667 scratch_block_data +
2668 workspace_height_stride * k_height * stride_val +
2669 depth_micro_stride * j_depth;
2670 uint8* output_data =
2671 output_block_data + output_height_stride * k_height + 8 * j_depth;
2672
2673 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2674 ++i_width) {
2675 const int output_width = i_width == output_width_micro_repeats
2676 ? residual_width
2677 : four_over_stride;
2678 const bool no_right_block = (output_width - 1) * stride_val < 2;
2679 TFLITE_DCHECK_LE(output_width * stride_val, 4);
2680 const int8* input_data =
2681 scratch_data + width_micro_stride * i_width;
2682 // Iterate over input width shifts within sub-micro blocks.
2683 for (int x = 0; x < output_width; ++x) {
2684 ConcatenateInputSubBlocks(x * stride_val, s,
2685 workspace_height_stride,
2686 width_micro_stride, no_right_block,
2687 input_data, sub_selected_input_data);
2688 Calculate3x3FilterOutput(
2689 *function_params, s, sub_selected_input_data, filter_bank,
2690 bias_data + (2 * j_depth + s) * bias_increment,
2691 output_values);
2692 for (int d = 0; d < 4; ++d) {
2693 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
2694 d] = output_values[d];
2695 }
2696 }
2697 }
2698 }
2699 }
2700 }
2701 }
2702 };
2703
2704 // Apply filter to macro block of input data and store results.
2705 //
2706 // Parameters for repeats and residual sizes are in terms of outputs.
2707 //
2708 // Requirement: depth_micro_repeats > 0 || residual_depth > 0.
2709 template <int32 stride, QuantizationType quantization_type>
2710 struct KernelMacroBlock<
2711 DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
2712 DepthwiseConvDepthMultiplication::kUnitInputDepth, stride> {
2713 // Construct a width-shifted combination of two input sub-blocks, effectively
2714 // concatenating them.
2715 //
2716 // The filter is applied using sub-blocks. These are in the needed form for
2717 // the first (width) offset. For subsequent offsets, the filter is applied to
2718 // shifted and combined data. The concatenation and shifting herein are fairly
2719 // straightforward, but in the optimized code this is an area of creative
2720 // design, because NEON instructions do not directly support the required
2721 // between-register permutation.
2722 //
2723 // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
2724 // move along the width for each output point calculation, data is shifted, in
2725 // essence, between two such blocks.
2726 //
2727 // selected_data has format height 3, width 4.
2728 //
2729 // When the micro block is trailing (the last across the macro-block width),
2730 // it would be illegal to load the right (next) block, and the no_right_block
2731 // indicates this scenario.
2732 static inline void ConcatenateInputSubBlocks(int offset,
2733 int workspace_height_stride,
2734 bool no_right_block,
2735 const int8* input_block,
2736 int8 selected_data[3][4]) {
2737 TFLITE_DCHECK_GE(offset, 0);
2738 TFLITE_DCHECK_LT(offset, 4);
2739 if (no_right_block) {
2740 for (int k_height = 0; k_height < 3; ++k_height) {
2741 memcpy(selected_data[k_height],
2742 &input_block[k_height * workspace_height_stride + offset],
2743 4 - offset);
2744 }
2745 } else {
2746 for (int k_height = 0; k_height < 3; ++k_height) {
2747 memcpy(selected_data[k_height],
2748 &input_block[k_height * workspace_height_stride + offset], 4);
2749 }
2750 }
2751 }
2752
2753 // Straight implementation of 3x3 filter within sub-micro block.
2754 static inline void Calculate3x3FilterOutput(
2755 const DepthwiseConvDotProdParams& function_params, int sub_block,
2756 const int8 selected_data[3][4], const int8 filter_bank[3][2][4][4],
2757 const int32* bias_data, uint8 output_values[4]) {
2758 const int32 output_activation_min =
2759 function_params.quantized_activation_min;
2760 const int32 output_activation_max =
2761 function_params.quantized_activation_max;
2762 const int32 output_multiplier = function_params.output_multiplier;
2763 const int32 output_shift = function_params.output_shift;
2764 const int32 output_offset = function_params.output_offset;
2765 for (int d = 0; d < 4; ++d) {
2766 int32 acc = 0;
2767 for (int y = 0; y < 3; ++y) {
2768 for (int x = 0; x < 4; ++x) {
2769 int32 input_val = selected_data[y][x];
2770 int32 filter_val = filter_bank[y][sub_block][d][x];
2771 acc += filter_val * input_val;
2772 }
2773 }
2774 acc += bias_data[d];
2775 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
2776 DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
2777 output_shift);
2778 acc += output_offset;
2779 acc = std::max(acc, output_activation_min);
2780 acc = std::min(acc, output_activation_max);
2781 output_values[d] = static_cast<uint8>(acc);
2782 }
2783 }
2784
2785 static inline void Run(const int8* scratch_block_data,
2786 const int8* filter_workspace, const int32* bias_data,
2787 uint8* output_block_data,
2788 const DepthwiseConvDotProdParams* function_params) {
2789 const int workspace_height_stride =
2790 function_params->workspace_height_stride;
2791 const int output_width_micro_repeats =
2792 function_params->output_width_micro_repeats;
2793 const int depth_micro_repeats = function_params->depth_micro_repeats;
2794 const int depth = function_params->output_depth;
2795 const int stride_val = function_params->stride;
2796 const int four_over_stride = function_params->four_over_stride;
2797
2798 const int workspace_width_micro_repeats =
2799 function_params->workspace_width_micro_repeats;
2800 const int output_width_overall_micro_repeats =
2801 function_params->output_width_overall_micro_repeats;
2802 const int block_height = function_params->outbound_block_height;
2803 const int residual_width = function_params->output_residual_width;
2804 const int output_height_stride = function_params->output_height_stride;
2805 constexpr int bias_increment = 4;
2806 TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
2807
2808 TFLITE_DCHECK(depth_micro_repeats > 0);
2809
2810 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
2811
2812 // Simulate NEON-register transposition of subset of filter.
2813 int8 filter_bank[3][2][4][4]; // Height 3, sub-block, depth 4, width 4.
2814 // Simulate NEON-register input data concatenation + sub-selection.
2815 int8 sub_selected_input_data[3][4]; // Height 3, width 4.
2816 uint8 output_values[4]; // Depth 4.
2817
2818 // The outer 3 loops go through all the micro blocks in a macro block, and
2819 // separately treat the two sub-blocks within each micro block.
2820 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2821 memcpy(filter_bank[0][0][0],
2822 filter_workspace + j_depth * shuffled_filter_increment,
2823 shuffled_filter_increment);
2824
2825 for (int s = 0; s < 2; ++s) {
2826 for (int k_height = 0; k_height < block_height; ++k_height) {
2827 const int8* scratch_data =
2828 scratch_block_data +
2829 workspace_height_stride * k_height * stride_val;
2830 uint8* output_data =
2831 output_block_data + output_height_stride * k_height + 8 * j_depth;
2832
2833 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2834 ++i_width) {
2835 const int output_width = i_width == output_width_micro_repeats
2836 ? residual_width
2837 : four_over_stride;
2838 const bool no_right_block = i_width == output_width_micro_repeats &&
2839 output_width_overall_micro_repeats ==
2840 workspace_width_micro_repeats;
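// The final micro block may be a partial ("residual") block. no_right_block
// is set when it is also the last workspace micro block, in which case no
// data may be read beyond the current 4-wide window.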
2841 TFLITE_DCHECK_LE(output_width * stride_val, 4);
2842 const int8* input_data = scratch_data + 4 * i_width;
2843 // Iterate over input width shifts within 4x4 blocks.
2844 for (int x = 0; x < output_width; ++x) {
2845 ConcatenateInputSubBlocks(x * stride_val, workspace_height_stride,
2846 no_right_block, input_data,
2847 sub_selected_input_data);
2848 Calculate3x3FilterOutput(
2849 *function_params, s, sub_selected_input_data, filter_bank,
2850 bias_data + (2 * j_depth + s) * bias_increment,
2851 output_values);
2852 for (int d = 0; d < 4; ++d) {
2853 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
2854 d] = output_values[d];
2855 }
2856 }
2857 }
2858 }
2859 }
2860 }
2861 }
2862 };
2863
2864 // Beginning of code section containing intermediate code transformation.
2865 //
2866 // This section is only compiled when kUseUnwound3x3DotProduct versions of
2867 // templated functions are selected.
2868 template <int32 stride, QuantizationType quantization_type>
2869 struct KernelMacroBlock<
2870 DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
2871 DepthwiseConvDepthMultiplication::kNoMultiplication, stride> {
2872 static inline void Run(const int8* scratch_block_data,
2873 const int8* filter_workspace, const int32* bias_data,
2874 uint8* output_block_data,
2875 const DepthwiseConvDotProdParams* function_params) {
2876 const int workspace_height_stride =
2877 function_params->workspace_height_stride;
2878 const int input_width_overall_micro_repeats =
2879 function_params->input_width_overall_micro_repeats;
2880 const int output_width_micro_repeats =
2881 function_params->output_width_micro_repeats;
2882 const int depth_micro_repeats = function_params->depth_micro_repeats;
2883 const int depth = function_params->input_depth;
2884 const int stride_val = function_params->stride;
2885 const int four_over_stride = function_params->four_over_stride;
2886
2887 const int output_width_overall_micro_repeats =
2888 function_params->output_width_overall_micro_repeats;
2889 const int block_height = function_params->outbound_block_height;
2890 const int residual_width = function_params->output_residual_width;
2891 const int output_height_stride = function_params->output_height_stride;
2892 const int bias_increment = function_params->bias_increment;
2893
2894 TFLITE_DCHECK(depth_micro_repeats > 0);
2895 const int width_micro_stride = 4 * 8;
2896 const int depth_micro_stride =
2897 width_micro_stride * input_width_overall_micro_repeats;
2898
2899 const int32 output_activation_min =
2900 function_params->quantized_activation_min;
2901 const int32 output_activation_max =
2902 function_params->quantized_activation_max;
2903 const int32 output_multiplier = function_params->output_multiplier;
2904 const int32 output_shift = function_params->output_shift;
2905 const int32 output_offset = function_params->output_offset;
2906
2907 // Simulate NEON-register transposition of subset of filter.
2908 int8 filter_bank_a_0[4][4]; // Depth 4, width 4.
2909 int8 filter_bank_a_1[4][4];
2910 int8 filter_bank_a_2[4][4];
2911 int8 filter_bank_b_0[4][4];
2912 int8 filter_bank_b_1[4][4];
2913 int8 filter_bank_b_2[4][4];
2914 // Simulate NEON-register input data concatenation + sub-selection.
2915 // Also sub-block, height 3, depth 4, width 4.
2916 uint8 output_values[4]; // Sub-block, depth 4.
2917 // The left/right banks below have format depth 4, width 4.
2918 int8 left_bank_0[4][4];
2919 int8 left_bank_1[4][4];
2920 int8 left_bank_2[4][4];
2921 int8 right_bank_0[4][4];
2922 int8 right_bank_1[4][4];
2923 int8 right_bank_2[4][4];
2924 memset(right_bank_0[0], 0, 16);
2925 memset(right_bank_1[0], 0, 16);
2926 memset(right_bank_2[0], 0, 16);
2927
2928 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
2929
2930 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2931 const int8* filter_block =
2932 filter_workspace + shuffled_filter_increment * j_depth;
2933
2934 memcpy(filter_bank_a_0, filter_block, 16);
2935 memcpy(filter_bank_b_0, filter_block + 16, 16);
2936 memcpy(filter_bank_a_1, filter_block + 32, 16);
2937 memcpy(filter_bank_b_1, filter_block + 48, 16);
2938 memcpy(filter_bank_a_2, filter_block + 64, 16);
2939 memcpy(filter_bank_b_2, filter_block + 80, 16);
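// The shuffled filter block interleaves the two sub-blocks row by row:
// 16 bytes of sub-block "a" then 16 bytes of sub-block "b" for filter row 0,
// then the same for rows 1 and 2, matching shuffled_filter_increment above.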
2940
2941 for (int s = 0; s < 2; ++s) {
2942 // Work through one slice, by row, at a time.
2943 for (int k_height = 0; k_height < block_height; ++k_height) {
2944 const int8* scratch_data =
2945 scratch_block_data +
2946 workspace_height_stride * k_height * stride_val +
2947 depth_micro_stride * j_depth;
2948 uint8* output_data =
2949 output_block_data + output_height_stride * k_height + 8 * j_depth;
2950 const int8* input_data_0 = scratch_data + s * 2 * 8;
2951
2952 // Load first sub-micro block of data into operational banks.
2953 memcpy(left_bank_0[0], input_data_0, 16);
2954 memcpy(left_bank_1[0], input_data_0 + workspace_height_stride, 16);
2955 memcpy(left_bank_2[0], input_data_0 + 2 * workspace_height_stride,
2956 16);
2957
2958 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2959 ++i_width) {
2960 const int output_width = i_width == output_width_micro_repeats
2961 ? residual_width
2962 : four_over_stride;
2963 TFLITE_DCHECK_LE(output_width * stride_val, 4);
2964 const int8* input_data =
2965 input_data_0 + width_micro_stride * i_width;
2966 const bool no_right_block = (output_width - 1) * stride_val < 2;
2967
2968 // Load next sub-micro block of data.
2969 if (!no_right_block) {
2970 memcpy(right_bank_0[0], input_data + width_micro_stride, 16);
2971 memcpy(right_bank_1[0],
2972 input_data + workspace_height_stride + width_micro_stride,
2973 16);
2974 memcpy(
2975 right_bank_2[0],
2976 input_data + 2 * workspace_height_stride + width_micro_stride,
2977 16);
2978 }
2979
2980 // Iterate over input width shifts within 4x4 blocks.
2981 for (int x = 0; x < output_width; ++x) {
2982 // Operate on depth of 4 in batches.
2983 for (int d = 0; d < 4; ++d) {
2984 int32 acc = 0;
2985 for (int x = 0; x < 4; ++x) {
2986 int32 input_val = left_bank_0[d][x];
2987 int32 filter_val = filter_bank_a_0[d][x];
2988 acc += filter_val * input_val;
2989 }
2990 for (int x = 0; x < 4; ++x) {
2991 int32 input_val = left_bank_1[d][x];
2992 int32 filter_val = filter_bank_a_1[d][x];
2993 acc += filter_val * input_val;
2994 }
2995 for (int x = 0; x < 4; ++x) {
2996 int32 input_val = left_bank_2[d][x];
2997 int32 filter_val = filter_bank_a_2[d][x];
2998 acc += filter_val * input_val;
2999 }
3000 acc += bias_data[d];
3001 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
3002 DepthwiseConvOutputRounding::kUpward>(
3003 acc, output_multiplier, output_shift);
3004 acc += output_offset;
3005 acc = std::max(acc, output_activation_min);
3006 acc = std::min(acc, output_activation_max);
3007 output_values[d] = static_cast<uint8>(acc);
3008 }
3009
3010 for (int d = 0; d < 4; ++d) {
3011 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
3012 d] = output_values[d];
3013 }
3014
3015 // Simulate shifting instructions.
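// For stride 1 the 4-wide data window advances by one input column, pulling
// the new column in from the right bank; for stride 2 it advances by two.
// This models the register shuffles used in the intrinsics version below.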
3016 if (stride_val == 1) {
3017 for (int depth_index = 0; depth_index < 4; ++depth_index) {
3018 for (int z = 0; z < 3; ++z) {
3019 left_bank_0[depth_index][z] =
3020 left_bank_0[depth_index][z + 1];
3021 left_bank_1[depth_index][z] =
3022 left_bank_1[depth_index][z + 1];
3023 left_bank_2[depth_index][z] =
3024 left_bank_2[depth_index][z + 1];
3025 }
3026 left_bank_0[depth_index][3] = right_bank_0[depth_index][0];
3027 left_bank_1[depth_index][3] = right_bank_1[depth_index][0];
3028 left_bank_2[depth_index][3] = right_bank_2[depth_index][0];
3029 for (int z = 0; z < 3; ++z) {
3030 right_bank_0[depth_index][z] =
3031 right_bank_0[depth_index][z + 1];
3032 right_bank_1[depth_index][z] =
3033 right_bank_1[depth_index][z + 1];
3034 right_bank_2[depth_index][z] =
3035 right_bank_2[depth_index][z + 1];
3036 }
3037 }
3038 } else {
3039 for (int depth_index = 0; depth_index < 4; ++depth_index) {
3040 for (int z = 0; z < 2; ++z) {
3041 left_bank_0[depth_index][z] =
3042 left_bank_0[depth_index][z + 2];
3043 left_bank_1[depth_index][z] =
3044 left_bank_1[depth_index][z + 2];
3045 left_bank_2[depth_index][z] =
3046 left_bank_2[depth_index][z + 2];
3047 }
3048 left_bank_0[depth_index][2] = right_bank_0[depth_index][0];
3049 left_bank_1[depth_index][2] = right_bank_1[depth_index][0];
3050 left_bank_2[depth_index][2] = right_bank_2[depth_index][0];
3051 left_bank_0[depth_index][3] = right_bank_0[depth_index][1];
3052 left_bank_1[depth_index][3] = right_bank_1[depth_index][1];
3053 left_bank_2[depth_index][3] = right_bank_2[depth_index][1];
3054 for (int z = 0; z < 2; ++z) {
3055 right_bank_0[depth_index][z] =
3056 right_bank_0[depth_index][z + 2];
3057 right_bank_1[depth_index][z] =
3058 right_bank_1[depth_index][z + 2];
3059 right_bank_2[depth_index][z] =
3060 right_bank_2[depth_index][z + 2];
3061 }
3062 }
3063 }
3064 }
3065 }
3066 }
3067 bias_data += bias_increment;
3068
3069 // Move filter for second sub-block into operational filter.
3070 for (int z = 0; z < 4; ++z) {
3071 for (int x = 0; x < 4; ++x) {
3072 filter_bank_a_0[z][x] = filter_bank_b_0[z][x];
3073 filter_bank_a_1[z][x] = filter_bank_b_1[z][x];
3074 filter_bank_a_2[z][x] = filter_bank_b_2[z][x];
3075 }
3076 }
3077 }
3078 }
3079 }
3080 };
3081
3082 template <int32 stride, QuantizationType quantization_type>
3083 struct KernelMacroBlock<
3084 DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
3085 DepthwiseConvDepthMultiplication::kUnitInputDepth, stride> {
3086 static inline void Run(const int8* scratch_block_data,
3087 const int8* filter_workspace, const int32* bias_data,
3088 uint8* output_block_data,
3089 const DepthwiseConvDotProdParams* function_params) {
3090 const int workspace_height_stride =
3091 function_params->workspace_height_stride;
3092 const int output_width_micro_repeats =
3093 function_params->output_width_micro_repeats;
3094 const int depth_micro_repeats = function_params->depth_micro_repeats;
3095 const int output_depth = function_params->output_depth;
3096 const int stride_val = function_params->stride;
3097 const int four_over_stride = function_params->four_over_stride;
3098
3099 const int output_width_overall_micro_repeats =
3100 function_params->output_width_overall_micro_repeats;
3101 const int block_height = function_params->outbound_block_height;
3102 const int residual_width = function_params->output_residual_width;
3103 const int output_height_stride = function_params->output_height_stride;
3104 const int bias_increment = function_params->bias_increment;
3105
3106 const int32 output_activation_min =
3107 function_params->quantized_activation_min;
3108 const int32 output_activation_max =
3109 function_params->quantized_activation_max;
3110 const int32 output_multiplier = function_params->output_multiplier;
3111 const int32 output_shift = function_params->output_shift;
3112 const int32 output_offset = function_params->output_offset;
3113
3114 TFLITE_DCHECK(depth_micro_repeats > 0);
3115
3116 TFLITE_DCHECK_EQ(bias_increment, 4);
3117
3118 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
3119
3120 // Simulate NEON-register transposition of subset of filter.
3121 int8 filter_bank_a_0[4][4]; // Depth 4, width 4.
3122 int8 filter_bank_a_1[4][4];
3123 int8 filter_bank_a_2[4][4];
3124 int8 filter_bank_b_0[4][4];
3125 int8 filter_bank_b_1[4][4];
3126 int8 filter_bank_b_2[4][4];
3127 // Simulate NEON-register input data concatenation + sub-selection.
3128 // Each input bank below holds 8 consecutive width positions (unit input depth).
3129
3130 int8 input_bank_0[8];
3131 int8 input_bank_1[8];
3132 int8 input_bank_2[8];
3133
3134 TFLITE_DCHECK_GE(depth_micro_repeats, 1);
3135
3136 uint8 output_values[2][4]; // Sub-block, depth 4.
3137
3138 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
3139 memcpy(filter_bank_a_0, filter_workspace, 16);
3140 memcpy(filter_bank_b_0, filter_workspace + 16, 16);
3141 memcpy(filter_bank_a_1, filter_workspace + 32, 16);
3142 memcpy(filter_bank_b_1, filter_workspace + 48, 16);
3143 memcpy(filter_bank_a_2, filter_workspace + 64, 16);
3144 memcpy(filter_bank_b_2, filter_workspace + 80, 16);
3145
3146 // Work through one slice, by row, at a time.
3147 for (int k_height = 0; k_height < block_height; ++k_height) {
3148 const int8* scratch_data =
3149 scratch_block_data +
3150 workspace_height_stride * k_height * stride_val;
3151 uint8* output_data =
3152 output_block_data + output_height_stride * k_height + 8 * j_depth;
3153
3154 memcpy(input_bank_0, scratch_data, 4);
3155 memcpy(input_bank_1, scratch_data + workspace_height_stride, 4);
3156 memcpy(input_bank_2, scratch_data + 2 * workspace_height_stride, 4);
3157
3158 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
3159 ++i_width) {
3160 const int output_width = i_width == output_width_micro_repeats
3161 ? residual_width
3162 : four_over_stride;
3163
3164 TFLITE_DCHECK_LE(output_width * stride_val, 4);
3165 const int8* input_data = scratch_data + 4 * i_width;
3166
3167 memcpy(input_bank_0 + 4, input_data + 4, 4);
3168 memcpy(input_bank_1 + 4, input_data + workspace_height_stride + 4, 4);
3169 memcpy(input_bank_2 + 4, input_data + 2 * workspace_height_stride + 4,
3170 4);
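// Each 8-byte input bank now holds the current 4 width positions in its low
// half and the next 4 in its high half, so the bank can simply be shifted
// down by stride_val after each output column.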
3171
3172 // Iterate over input width shifts within 4x4 blocks.
3173 for (int w = 0; w < output_width; ++w) {
3174 constexpr int offset =
3175 0; // The input banks are shifted after each output instead of indexing with an offset here.
3176
3177 {
3178 const int s = 0;
3179 for (int d = 0; d < 4; ++d) {
3180 int32 acc = bias_data[s * 4 + d];
3181 for (int x = 0; x < 4; ++x) {
3182 int32 input_val_0 = input_bank_0[offset + x];
3183 int32 filter_val_0 = filter_bank_a_0[d][x];
3184 acc += filter_val_0 * input_val_0;
3185 int32 input_val_1 = input_bank_1[offset + x];
3186 int32 filter_val_1 = filter_bank_a_1[d][x];
3187 acc += filter_val_1 * input_val_1;
3188 int32 input_val_2 = input_bank_2[offset + x];
3189 int32 filter_val_2 = filter_bank_a_2[d][x];
3190 acc += filter_val_2 * input_val_2;
3191 }
3192 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
3193 DepthwiseConvOutputRounding::kUpward>(
3194 acc, output_multiplier, output_shift);
3195 acc += output_offset;
3196 acc = std::max(acc, output_activation_min);
3197 acc = std::min(acc, output_activation_max);
3198 output_values[s][d] = static_cast<uint8>(acc);
3199
3200 output_data[s * 4 + d] = output_values[s][d];
3201 }
3202 }
3203 {
3204 const int s = 1;
3205 for (int d = 0; d < 4; ++d) {
3206 int32 acc = bias_data[s * 4 + d];
3207 for (int x = 0; x < 4; ++x) {
3208 int32 input_val_0 = input_bank_0[offset + x];
3209 int32 filter_val_0 = filter_bank_b_0[d][x];
3210 acc += filter_val_0 * input_val_0;
3211 int32 input_val_1 = input_bank_1[offset + x];
3212 int32 filter_val_1 = filter_bank_b_1[d][x];
3213 acc += filter_val_1 * input_val_1;
3214 int32 input_val_2 = input_bank_2[offset + x];
3215 int32 filter_val_2 = filter_bank_b_2[d][x];
3216 acc += filter_val_2 * input_val_2;
3217 }
3218 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
3219 DepthwiseConvOutputRounding::kUpward>(
3220 acc, output_multiplier, output_shift);
3221 acc += output_offset;
3222 acc = std::max(acc, output_activation_min);
3223 acc = std::min(acc, output_activation_max);
3224 output_values[s][d] = static_cast<uint8>(acc);
3225
3226 output_data[s * 4 + d] = output_values[s][d];
3227 }
3228 }
3229
3230 // Simulate register shifts.
3231 for (int i = 0; i < (8 - stride_val); ++i) {
3232 input_bank_0[i] = input_bank_0[i + stride_val];
3233 input_bank_1[i] = input_bank_1[i + stride_val];
3234 input_bank_2[i] = input_bank_2[i + stride_val];
3235 }
3236
3237 output_data += output_depth;
3238 }
3239 }
3240 }
3241 bias_data += 2 * bias_increment;
3242 filter_workspace += shuffled_filter_increment;
3243 }
3244 }
3245 };
3246 // The preceding section is only compiled when kUseUnwound3x3DotProduct versions
3247 // of templated functions are selected.
3248 //
3249 // End of code section containing intermediate code transformation.
3250
3251 #ifdef USE_NEON
3252 template <>
3253 struct KernelMacroBlock<
3254 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
3255 QuantizationType::kNonPerChannelUint8,
3256 DepthwiseConvDepthMultiplication::kNoMultiplication,
3257 /*stride=*/1> {
3258 static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
3259 static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
3260 return vmin_u8(a, b);
3261 }
3262 static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
3263 return vmax_u8(a, b);
3264 }
3265 static inline uint8x16_t util_vminq_x8(uint8x16_t a, uint8x16_t b) {
3266 return vminq_u8(a, b);
3267 }
3268 static inline uint8x16_t util_vmaxq_x8(uint8x16_t a, uint8x16_t b) {
3269 return vmaxq_u8(a, b);
3270 }
3271
3272 static inline void KernelMacroBlockIntrinsics(
3273 const int8* scratch_block_data, const int8* filter_workspace,
3274 const int32* bias_data, uint8* output_block_data,
3275 const DepthwiseConvDotProdParams* function_params) {
3276 static constexpr QuantizationType quantization_type =
3277 QuantizationType::kNonPerChannelUint8;
3278
3279 const int workspace_height_stride =
3280 function_params->workspace_height_stride;
3281 const int input_width_overall_micro_repeats =
3282 function_params->input_width_overall_micro_repeats;
3283 const int output_width_micro_repeats =
3284 function_params->output_width_micro_repeats;
3285 const int depth_micro_repeats = function_params->depth_micro_repeats;
3286 const int depth = function_params->input_depth;
3287
3288 const int output_width_overall_micro_repeats =
3289 function_params->output_width_overall_micro_repeats;
3290 const int block_height = function_params->outbound_block_height;
3291 const int residual_width = function_params->output_residual_width;
3292 const int output_height_stride = function_params->output_height_stride;
3293 constexpr int kBiasIncrement = 4;
3294
3295 TFLITE_DCHECK(depth_micro_repeats > 0);
3296 const int width_micro_stride = 4 * 8;
3297 const int depth_micro_stride =
3298 width_micro_stride * input_width_overall_micro_repeats;
3299
3300 const int32 output_activation_min =
3301 function_params->quantized_activation_min;
3302 const int32 output_activation_max =
3303 function_params->quantized_activation_max;
3304 const int32 output_multiplier = function_params->output_multiplier;
3305 const int32 output_shift = function_params->output_shift;
3306 const int32 output_offset = function_params->output_offset;
3307 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
3308 TFLITE_DCHECK_GE(output_activation_min, 0);
3309 TFLITE_DCHECK_LT(output_activation_min, 256);
3310 TFLITE_DCHECK_GE(output_activation_max, 0);
3311 TFLITE_DCHECK_LT(output_activation_max, 256);
3312 } else {
3313 TFLITE_DCHECK_GE(output_activation_min, -128);
3314 TFLITE_DCHECK_LT(output_activation_min, 128);
3315 TFLITE_DCHECK_GE(output_activation_max, -128);
3316 TFLITE_DCHECK_LT(output_activation_max, 128);
3317 }
3318 TFLITE_DCHECK_GE(output_offset, -32768);
3319 TFLITE_DCHECK_LT(output_offset, 32768);
3320
3321 const int16x8_t output_offset_vec =
3322 vdupq_n_s16(static_cast<int16>(output_offset));
3323 const uint8x16_t output_activation_min_vec =
3324 vdupq_n_u8(static_cast<uint8>(output_activation_min));
3325 const uint8x16_t output_activation_max_vec =
3326 vdupq_n_u8(static_cast<uint8>(output_activation_max));
3327
3328 const int8* input_data_depthwise = scratch_block_data;
3329 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3330 output_data_depthwise = output_block_data;
3331 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
3332 // Simulate NEON-register transposition of subset of filter.
3333 int8x16_t filter_reg_0_a;
3334 int8x16_t filter_reg_0_b;
3335 int8x16_t filter_reg_1_a;
3336 int8x16_t filter_reg_1_b;
3337 int8x16_t filter_reg_2_a;
3338 int8x16_t filter_reg_2_b;
3339 int8x16_t filter_reg_0_a_shifted;
3340 int8x16_t filter_reg_1_a_shifted;
3341 int8x16_t filter_reg_2_a_shifted;
3342
3343 filter_reg_0_a = vld1q_s8(filter_workspace);
3344 filter_workspace += 16;
3345 filter_reg_0_b = vld1q_s8(filter_workspace);
3346 filter_workspace += 16;
3347 filter_reg_1_a = vld1q_s8(filter_workspace);
3348 filter_workspace += 16;
3349 filter_reg_1_b = vld1q_s8(filter_workspace);
3350 filter_workspace += 16;
3351 filter_reg_2_a = vld1q_s8(filter_workspace);
3352 filter_workspace += 16;
3353 filter_reg_2_b = vld1q_s8(filter_workspace);
3354 filter_workspace += 16;
3355
3356 filter_reg_0_a_shifted = vreinterpretq_s8_u32(
3357 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
3358 filter_reg_1_a_shifted = vreinterpretq_s8_u32(
3359 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
3360 filter_reg_2_a_shifted = vreinterpretq_s8_u32(
3361 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
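// Each 32-bit lane of the "shifted" filter copies is the original lane
// shifted up by one byte. Dotting the shifted filters against unchanged
// input registers yields the result for the next output column, avoiding a
// data shuffle for odd column positions.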
3362
3363 if (block_height == 4) {
3364 for (int s = 0; s < 2; ++s) {
3365 // Work through one slice, by row, at a time.
3366 const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
3367 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3368 output_data_base = output_data_depthwise + 4 * s;
3369
3370 const int8* next_input_data = input_data_base;
3371 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3372 output_data = output_data_base;
3373
3374 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
3375 bias_data += kBiasIncrement;
3376
3377 // Load first sub-micro block of data into operational banks.
3378 int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
3379 int8x16_t left_bank_1_reg =
3380 vld1q_s8(next_input_data + workspace_height_stride);
3381 int8x16_t left_bank_2_reg =
3382 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3383 int8x16_t left_bank_3_reg =
3384 vld1q_s8(next_input_data + 3 * workspace_height_stride);
3385 int8x16_t left_bank_4_reg =
3386 vld1q_s8(next_input_data + 4 * workspace_height_stride);
3387 int8x16_t left_bank_5_reg =
3388 vld1q_s8(next_input_data + 5 * workspace_height_stride);
3389
3390 int32x4_t acc0;
3391 int32x4_t acc1;
3392 int32x4_t acc2;
3393 int32x4_t acc3;
3394
3395 acc0 = adjusted_bias_data;
3396 acc1 = adjusted_bias_data;
3397 acc2 = adjusted_bias_data;
3398 acc3 = adjusted_bias_data;
3399
3400 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3401 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3402 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3403 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
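// Pre-seed partial accumulations for the first output column of the coming
// i_width loop; the remaining row contributions are added at the top of the
// loop body. This interleaving presumably mirrors the instruction scheduling
// of the assembly kernels.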
3404
3405 for (int i_width = 0; i_width < output_width_micro_repeats;
3406 ++i_width) {
3407 next_input_data += width_micro_stride;
3408
3409 // Iterate over input width shifts within 4x4 blocks.
3410 {
3411 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3412 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3413 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3414 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3415 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3416 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3417 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3418 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3419
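// Requantization: vqrdmulhq_n_s32 is a saturating rounding doubling high
// multiply, roughly acc * output_multiplier / 2^31, and DivideByPOT then
// applies a rounding right shift by (-output_shift) with upward rounding.
// Together these scale acc by the real output multiplier before the offset,
// clamp and narrowing steps below.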
3420 // Fixed-point multiplication.
3421 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3422 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3423 acc0, -output_shift);
3424 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3425 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3426 acc1, -output_shift);
3427 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3428 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3429 acc2, -output_shift);
3430 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3431 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3432 acc3, -output_shift);
3433 // Add the output offset.
3434 int16x8_t acc_s16_0_1 =
3435 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3436 int16x8_t acc_s16_2_3 =
3437 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3438 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3439 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3440 // Apply the activation function.
3441 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3442 vqmovxn_s16(acc_s16_2_3));
3443 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3444 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3445
3446 vst1q_lane_u8x4(output_data, acc_u8_all, 0);
3447 vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
3448 1);
3449 vst1q_lane_u8x4(output_data + 2 * output_height_stride,
3450 acc_u8_all, 2);
3451 vst1q_lane_u8x4(output_data + 3 * output_height_stride,
3452 acc_u8_all, 3);
3453
3454 output_data += depth;
3455 }
3456
3457 // Load next sub-micro block of data.
3458 int8x16_t right_bank_0_reg;
3459 int8x16_t right_bank_1_reg;
3460 int8x16_t right_bank_2_reg;
3461 int8x16_t right_bank_3_reg;
3462 int8x16_t right_bank_4_reg;
3463 int8x16_t right_bank_5_reg;
3464
3465 // Loading of next block always valid.
3466 right_bank_0_reg = vld1q_s8(next_input_data);
3467 right_bank_1_reg =
3468 vld1q_s8(next_input_data + workspace_height_stride);
3469 right_bank_2_reg =
3470 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3471 right_bank_3_reg =
3472 vld1q_s8(next_input_data + 3 * workspace_height_stride);
3473 right_bank_4_reg =
3474 vld1q_s8(next_input_data + 4 * workspace_height_stride);
3475 right_bank_5_reg =
3476 vld1q_s8(next_input_data + 5 * workspace_height_stride);
3477
3478 {
3479 acc0 = adjusted_bias_data;
3480 acc1 = adjusted_bias_data;
3481 acc2 = adjusted_bias_data;
3482 acc3 = adjusted_bias_data;
3483
3484 acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
3485 acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
3486 acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
3487 acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
3488 acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
3489 acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
3490 acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
3491 acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
3492 acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
3493 acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
3494 acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
3495 acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
3496
3497 // Fixed-point multiplication.
3498 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3499 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3500 acc0, -output_shift);
3501 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3502 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3503 acc1, -output_shift);
3504 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3505 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3506 acc2, -output_shift);
3507 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3508 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3509 acc3, -output_shift);
3510 // Add the output offset.
3511 int16x8_t acc_s16_0_1 =
3512 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3513 int16x8_t acc_s16_2_3 =
3514 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3515 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3516 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3517 // Apply the activation function.
3518 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3519 vqmovxn_s16(acc_s16_2_3));
3520 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3521 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3522
3523 vst1q_lane_u8x4(output_data, acc_u8_all, 0);
3524 vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
3525 1);
3526 vst1q_lane_u8x4(output_data + 2 * output_height_stride,
3527 acc_u8_all, 2);
3528 vst1q_lane_u8x4(output_data + 3 * output_height_stride,
3529 acc_u8_all, 3);
3530
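// The vrev32q_u16 + vtrn1 sequence below slides the input window two columns
// to the right, merging in the first two columns of the right bank. Combined
// with the byte-shifted filters, this covers all four column positions of the
// micro block.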
3531 left_bank_0_reg = vreinterpretq_s8_u16(
3532 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
3533 left_bank_1_reg = vreinterpretq_s8_u16(
3534 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
3535 left_bank_2_reg = vreinterpretq_s8_u16(
3536 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
3537 left_bank_3_reg = vreinterpretq_s8_u16(
3538 vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
3539 left_bank_4_reg = vreinterpretq_s8_u16(
3540 vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
3541 left_bank_5_reg = vreinterpretq_s8_u16(
3542 vrev32q_u16(vreinterpretq_u16_s8(left_bank_5_reg)));
3543 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
3544 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
3545 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
3546 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
3547 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
3548 vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
3549
3550 output_data += depth;
3551 }
3552
3553 {
3554 acc0 = adjusted_bias_data;
3555 acc1 = adjusted_bias_data;
3556 acc2 = adjusted_bias_data;
3557 acc3 = adjusted_bias_data;
3558
3559 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3560 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3561 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3562 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3563 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3564 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3565 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3566 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3567 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3568 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3569 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3570 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3571
3572 // Fixed-point multiplication.
3573 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3574 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3575 acc0, -output_shift);
3576 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3577 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3578 acc1, -output_shift);
3579 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3580 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3581 acc2, -output_shift);
3582 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3583 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3584 acc3, -output_shift);
3585 // Add the output offset.
3586 int16x8_t acc_s16_0_1 =
3587 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3588 int16x8_t acc_s16_2_3 =
3589 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3590 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3591 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3592 // Apply the activation function.
3593 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3594 vqmovxn_s16(acc_s16_2_3));
3595 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3596 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3597
3598 vst1q_lane_u8x4(output_data, acc_u8_all, 0);
3599 vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
3600 1);
3601 vst1q_lane_u8x4(output_data + 2 * output_height_stride,
3602 acc_u8_all, 2);
3603 vst1q_lane_u8x4(output_data + 3 * output_height_stride,
3604 acc_u8_all, 3);
3605
3606 output_data += depth;
3607 }
3608
3609 {
3610 acc0 = adjusted_bias_data;
3611 acc1 = adjusted_bias_data;
3612 acc2 = adjusted_bias_data;
3613 acc3 = adjusted_bias_data;
3614
3615 acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
3616 acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
3617 acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
3618 acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
3619 acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
3620 acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
3621 acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
3622 acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
3623 acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
3624 acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
3625 acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
3626 acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
3627
3628 // Fixed-point multiplication.
3629 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3630 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3631 acc0, -output_shift);
3632 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3633 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3634 acc1, -output_shift);
3635 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3636 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3637 acc2, -output_shift);
3638 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3639 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3640 acc3, -output_shift);
3641 // Add the output offset.
3642 int16x8_t acc_s16_0_1 =
3643 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3644 int16x8_t acc_s16_2_3 =
3645 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3646 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3647 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3648 // Apply the activation function.
3649 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3650 vqmovxn_s16(acc_s16_2_3));
3651 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3652 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3653
3654 vst1q_lane_u8x4(output_data, acc_u8_all, 0);
3655 vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
3656 1);
3657 vst1q_lane_u8x4(output_data + 2 * output_height_stride,
3658 acc_u8_all, 2);
3659 vst1q_lane_u8x4(output_data + 3 * output_height_stride,
3660 acc_u8_all, 3);
3661
3662 left_bank_0_reg = right_bank_0_reg;
3663 left_bank_1_reg = right_bank_1_reg;
3664 left_bank_2_reg = right_bank_2_reg;
3665 left_bank_3_reg = right_bank_3_reg;
3666 left_bank_4_reg = right_bank_4_reg;
3667 left_bank_5_reg = right_bank_5_reg;
3668
3669 output_data += depth;
3670 acc0 = adjusted_bias_data;
3671 acc1 = adjusted_bias_data;
3672 acc2 = adjusted_bias_data;
3673 acc3 = adjusted_bias_data;
3674
3675 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3676 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3677 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3678 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3679 }
3680 }
3681
3682 if (residual_width > 0) {
3683 next_input_data += width_micro_stride;
3684 const int output_width = residual_width;
3685
3686 // Load next sub-micro block of data.
3687 int8x16_t right_bank_0_reg;
3688 int8x16_t right_bank_1_reg;
3689 int8x16_t right_bank_2_reg;
3690 int8x16_t right_bank_3_reg;
3691 int8x16_t right_bank_4_reg;
3692 int8x16_t right_bank_5_reg;
3693 // Logic: (output_width - 1) * stride_val < 2.
3694 const bool no_right_block = output_width < 3;
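// With stride 1, output column x reads input columns x..x+2, so at most two
// residual outputs fit entirely within the left bank and the right bank is
// never read.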
3695
3696 if (no_right_block) {
3697 // Only needed for sanitizer checks.
3698 right_bank_0_reg = vdupq_n_s8(0);
3699 right_bank_1_reg = vdupq_n_s8(0);
3700 right_bank_2_reg = vdupq_n_s8(0);
3701 right_bank_3_reg = vdupq_n_s8(0);
3702 right_bank_4_reg = vdupq_n_s8(0);
3703 right_bank_5_reg = vdupq_n_s8(0);
3704 } else {
3705 right_bank_0_reg = vld1q_s8(next_input_data);
3706 right_bank_1_reg =
3707 vld1q_s8(next_input_data + workspace_height_stride);
3708 right_bank_2_reg =
3709 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3710 right_bank_3_reg =
3711 vld1q_s8(next_input_data + 3 * workspace_height_stride);
3712 right_bank_4_reg =
3713 vld1q_s8(next_input_data + 4 * workspace_height_stride);
3714 right_bank_5_reg =
3715 vld1q_s8(next_input_data + 5 * workspace_height_stride);
3716 }
3717
3718 // Iterate over input width shifts within 4x4 blocks.
3719 for (int x = 0; x < output_width; ++x) {
3720 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3721 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3722 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3723 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3724 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3725 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3726 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3727 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3728
3729 // Fixed-point multiplication.
3730 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3731 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3732 acc0, -output_shift);
3733 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3734 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3735 acc1, -output_shift);
3736 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3737 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3738 acc2, -output_shift);
3739 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3740 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3741 acc3, -output_shift);
3742 // Add the output offset.
3743 int16x8_t acc_s16_0_1 =
3744 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3745 int16x8_t acc_s16_2_3 =
3746 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3747 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3748 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3749 // Apply the activation function.
3750 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3751 vqmovxn_s16(acc_s16_2_3));
3752 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3753 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3754
3755 vst1q_lane_u8x4(output_data, acc_u8_all, 0);
3756 vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
3757 1);
3758 vst1q_lane_u8x4(output_data + 2 * output_height_stride,
3759 acc_u8_all, 2);
3760 vst1q_lane_u8x4(output_data + 3 * output_height_stride,
3761 acc_u8_all, 3);
3762
3763 biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
3764 biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
3765 biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
3766 biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
3767 biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
3768 biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
3769
3770 output_data += depth;
3771
3772 acc0 = adjusted_bias_data;
3773 acc1 = adjusted_bias_data;
3774 acc2 = adjusted_bias_data;
3775 acc3 = adjusted_bias_data;
3776
3777 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3778 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3779 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3780 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3781 }
3782 }
3783 input_data_base += 4 * workspace_height_stride;
3784 output_data_base += 4 * output_height_stride;
3785
3786 // Move to next sub-block: advance to second set of filters, to new
3787 // bias.
3788 filter_reg_0_a = filter_reg_0_b;
3789 filter_reg_1_a = filter_reg_1_b;
3790 filter_reg_2_a = filter_reg_2_b;
3791 filter_reg_0_a_shifted = vreinterpretq_s8_u32(
3792 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
3793 filter_reg_1_a_shifted = vreinterpretq_s8_u32(
3794 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
3795 filter_reg_2_a_shifted = vreinterpretq_s8_u32(
3796 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
3797 }
3798 } else {
3799 const int8* input_data_base = input_data_depthwise;
3800 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3801 output_data_base = output_data_depthwise;
3802
3803 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
3804 bias_data += kBiasIncrement;
3805 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
3806 bias_data += kBiasIncrement;
3807
3808 for (int k_height = 0; k_height < block_height; ++k_height) {
3809 const int8* next_input_data = input_data_base;
3810 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3811 output_data = output_data_base;
3812
3813 // Load first sub-micro block of data into operational banks.
3814 int8x16_t left_bank_0_reg_a = vld1q_s8(next_input_data);
3815 int8x16_t left_bank_1_reg_a =
3816 vld1q_s8(next_input_data + workspace_height_stride);
3817 int8x16_t left_bank_2_reg_a =
3818 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3819 int8x16_t left_bank_0_reg_b = vld1q_s8(next_input_data + 16);
3820 int8x16_t left_bank_1_reg_b =
3821 vld1q_s8(next_input_data + workspace_height_stride + 16);
3822 int8x16_t left_bank_2_reg_b =
3823 vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
3824
3825 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
3826 ++i_width) {
3827 next_input_data += width_micro_stride;
3828 const int output_width =
3829 i_width == output_width_micro_repeats ? residual_width : 4;
3830
3831 int8x16_t right_bank_0_reg_a;
3832 int8x16_t right_bank_1_reg_a;
3833 int8x16_t right_bank_2_reg_a;
3834 int8x16_t right_bank_0_reg_b;
3835 int8x16_t right_bank_1_reg_b;
3836 int8x16_t right_bank_2_reg_b;
3837 // Logic: (output_width - 1) * stride_val < 2.
3838 const bool no_right_block = output_width < 3;
3839
3840 // Load next sub-micro block of data.
3841 if (no_right_block) {
3842 // Only needed for sanitizer checks.
3843 right_bank_0_reg_a = vdupq_n_s8(0);
3844 right_bank_1_reg_a = vdupq_n_s8(0);
3845 right_bank_2_reg_a = vdupq_n_s8(0);
3846 right_bank_0_reg_b = vdupq_n_s8(0);
3847 right_bank_1_reg_b = vdupq_n_s8(0);
3848 right_bank_2_reg_b = vdupq_n_s8(0);
3849 } else {
3850 right_bank_0_reg_a = vld1q_s8(next_input_data);
3851 right_bank_1_reg_a =
3852 vld1q_s8(next_input_data + workspace_height_stride);
3853 right_bank_2_reg_a =
3854 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3855 right_bank_0_reg_b = vld1q_s8(next_input_data + 16);
3856 right_bank_1_reg_b =
3857 vld1q_s8(next_input_data + workspace_height_stride + 16);
3858 right_bank_2_reg_b =
3859 vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
3860 }
3861
3862 // Iterate over input width shifts within 4x4 blocks.
3863 for (int x = 0; x < output_width; ++x) {
3864 int32x4_t acc_a = adjusted_bias_data_a;
3865 int32x4_t acc_b = adjusted_bias_data_b;
3866 acc_a = vdotq_s32(acc_a, filter_reg_0_a, left_bank_0_reg_a);
3867 acc_a = vdotq_s32(acc_a, filter_reg_1_a, left_bank_1_reg_a);
3868 acc_a = vdotq_s32(acc_a, filter_reg_2_a, left_bank_2_reg_a);
3869 acc_b = vdotq_s32(acc_b, filter_reg_0_b, left_bank_0_reg_b);
3870 acc_b = vdotq_s32(acc_b, filter_reg_1_b, left_bank_1_reg_b);
3871 acc_b = vdotq_s32(acc_b, filter_reg_2_b, left_bank_2_reg_b);
3872
3873 // Fixed-point multiplication.
3874 acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
3875 acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
3876 acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3877 acc_a, -output_shift);
3878 acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3879 acc_b, -output_shift);
3880 // Add the output offset.
3881 int16x8_t acc_s16_0_0 =
3882 vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
3883 acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
3884 // Apply the activation function.
3885 uint8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
3886 acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
3887 vget_low_u8(output_activation_min_vec));
3888 acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
3889 vget_low_u8(output_activation_max_vec));
3890
3891 util_vst1_u8(output_data, acc_u8_0_0);
3892
3893 biregister_rotate_8(&left_bank_0_reg_a, &right_bank_0_reg_a);
3894 biregister_rotate_8(&left_bank_1_reg_a, &right_bank_1_reg_a);
3895 biregister_rotate_8(&left_bank_2_reg_a, &right_bank_2_reg_a);
3896 biregister_rotate_8(&left_bank_0_reg_b, &right_bank_0_reg_b);
3897 biregister_rotate_8(&left_bank_1_reg_b, &right_bank_1_reg_b);
3898 biregister_rotate_8(&left_bank_2_reg_b, &right_bank_2_reg_b);
3899
3900 output_data += depth;
3901 }
3902 }
3903 input_data_base += workspace_height_stride;
3904 output_data_base += output_height_stride;
3905 }
3906 }
3907 input_data_depthwise += depth_micro_stride;
3908 output_data_depthwise += 8;
3909 }
3910 } // NOLINT(readability/fn_size) Manually unrolled.
3911
3912 static inline void Run(const int8* scratch_block_data,
3913 const int8* filter_workspace, const int32* bias_data,
3914 uint8* output_block_data,
3915 const DepthwiseConvDotProdParams* function_params) {
3916 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
3917 output_block_data, function_params);
3918 }
3919 };
3920
3921 template <>
3922 struct KernelMacroBlock<
3923 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
3924 QuantizationType::kNonPerChannelUint8,
3925 DepthwiseConvDepthMultiplication::kNoMultiplication,
3926 /*stride=*/2> {
3927 static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
3928 static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
3929 return vmin_u8(a, b);
3930 }
3931 static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
3932 return vmax_u8(a, b);
3933 }
3934
3935 static inline void KernelMacroBlockIntrinsics(
3936 const int8* scratch_block_data, const int8* filter_workspace,
3937 const int32* bias_data, uint8* output_block_data,
3938 const DepthwiseConvDotProdParams* function_params) {
3939 static constexpr QuantizationType quantization_type =
3940 QuantizationType::kNonPerChannelUint8;
3941
3942 const int workspace_height_stride =
3943 function_params->workspace_height_stride;
3944 const int input_width_overall_micro_repeats =
3945 function_params->input_width_overall_micro_repeats;
3946 const int output_width_micro_repeats =
3947 function_params->output_width_micro_repeats;
3948 const int depth_micro_repeats = function_params->depth_micro_repeats;
3949 const int depth = function_params->input_depth;
3950 constexpr int kStrideVal = 2;
3951 constexpr int kFourOverStride = 2;
3952 TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
3953 TFLITE_DCHECK_EQ(function_params->four_over_stride, kFourOverStride);
3954
3955 const int workspace_width_micro_repeats =
3956 function_params->workspace_width_micro_repeats;
3957 const int output_width_overall_micro_repeats =
3958 function_params->output_width_overall_micro_repeats;
3959 const int block_height = function_params->outbound_block_height;
3960 const int residual_width = function_params->output_residual_width;
3961 const int output_height_stride = function_params->output_height_stride;
3962 constexpr int kBiasIncrement = 4;
3963
3964 TFLITE_DCHECK(depth_micro_repeats > 0);
3965 const int width_micro_stride = 4 * 8;
3966 const int depth_micro_stride =
3967 width_micro_stride * input_width_overall_micro_repeats;
3968
3969 const int32 output_activation_min =
3970 function_params->quantized_activation_min;
3971 const int32 output_activation_max =
3972 function_params->quantized_activation_max;
3973 const int32 output_multiplier = function_params->output_multiplier;
3974 const int32 output_shift = function_params->output_shift;
3975 const int32 output_offset = function_params->output_offset;
3976 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
3977 TFLITE_DCHECK_GE(output_activation_min, 0);
3978 TFLITE_DCHECK_LT(output_activation_min, 256);
3979 TFLITE_DCHECK_GE(output_activation_max, 0);
3980 TFLITE_DCHECK_LT(output_activation_max, 256);
3981 } else {
3982 TFLITE_DCHECK_GE(output_activation_min, -128);
3983 TFLITE_DCHECK_LT(output_activation_min, 128);
3984 TFLITE_DCHECK_GE(output_activation_max, -128);
3985 TFLITE_DCHECK_LT(output_activation_max, 128);
3986 }
3987 TFLITE_DCHECK_GE(output_offset, -32768);
3988 TFLITE_DCHECK_LT(output_offset, 32768);
3989
3990 // This version only does min/max on 64 bits.
3991 const int16x8_t output_offset_vec =
3992 vdupq_n_s16(static_cast<int16>(output_offset));
3993 const uint8x8_t output_activation_min_vec =
3994 vdup_n_u8(static_cast<uint8>(output_activation_min));
3995 const uint8x8_t output_activation_max_vec =
3996 vdup_n_u8(static_cast<uint8>(output_activation_max));
3997
3998 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
3999
4000 TFLITE_DCHECK_LE(block_height, 2);
4001
4002 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
4003 const int8* filter_block =
4004 filter_workspace + shuffled_filter_increment * j_depth;
4005
4006 if (block_height == 2) {
4007 for (int s = 0; s < 2; ++s) {
4008 // Simulate NEON-register transposition of subset of filter.
4009 int8x16_t filter_reg_0_a;
4010 int8x16_t filter_reg_1_a;
4011 int8x16_t filter_reg_2_a;
4012
4013 filter_reg_0_a = vld1q_s8(filter_block + s * 16);
4014 filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
4015 filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
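// Sub-block s selects the 16-byte "a" or "b" slice for each of the three
// filter rows; these sit at offsets 0/32/64 and 16/48/80 respectively within
// the shuffled filter block.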
4016
4017 const int8* scratch_data =
4018 scratch_block_data + depth_micro_stride * j_depth;
4019 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4020 output_data = output_block_data + 8 * j_depth;
4021 const int8* input_data_0 = scratch_data + s * 2 * 8;
4022
4023 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
4024
4025 // Load first sub-micro block of data into operational banks.
4026 int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
4027 int8x16_t left_bank_1_reg =
4028 vld1q_s8(input_data_0 + workspace_height_stride);
4029 int8x16_t left_bank_2_reg =
4030 vld1q_s8(input_data_0 + 2 * workspace_height_stride);
4031 int8x16_t left_bank_3_reg =
4032 vld1q_s8(input_data_0 + 3 * workspace_height_stride);
4033 int8x16_t left_bank_4_reg =
4034 vld1q_s8(input_data_0 + 4 * workspace_height_stride);
4035
4036 int8x16_t right_bank_0_reg;
4037 int8x16_t right_bank_1_reg;
4038 int8x16_t right_bank_2_reg;
4039 int8x16_t right_bank_3_reg;
4040 int8x16_t right_bank_4_reg;
4041
4042 int32x4_t acc0;
4043 int32x4_t acc1;
4044 int16x8_t acc_s16_0_1;
4045 uint8x8_t acc_u8;
4046
4047 int i_width = 0;
4048
4049 // When output_width_micro_repeats < output_width_overall_micro_repeats,
4050 // the last micro block is a residual with 0 < residual_width <= 2. Only a
4051 // width-1 residual is deferred to the second loop below.
4052 const int adjusted_width_micro_repeats =
4053 (output_width_micro_repeats <
4054 output_width_overall_micro_repeats) &&
4055 (residual_width == 1)
4056 ? output_width_micro_repeats
4057 : output_width_overall_micro_repeats;
4058
4059 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
4060 const int output_width = kFourOverStride;
4061 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
4062 const int8* input_data =
4063 input_data_0 + width_micro_stride * i_width;
4064 acc0 = adjusted_bias_data;
4065 acc1 = adjusted_bias_data;
4066 right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
4067 right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
4068 workspace_height_stride);
4069
4070 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4071 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4072 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4073 output_data_base = output_data + depth * 2 * i_width + 4 * s;
4074
4075 right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
4076 2 * workspace_height_stride);
4077 right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
4078 3 * workspace_height_stride);
4079 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4080 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4081 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4082 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4083 right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
4084 4 * workspace_height_stride);
4085
4086 // Fixed-point multiplication.
4087 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4088 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4089 acc0, -output_shift);
4090 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4091 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4092 acc1, -output_shift);
4093 // Add the output offset.
4094 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4095 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4096 // Apply the activation function.
4097 acc_u8 = vqmovxn_s16(acc_s16_0_1);
4098 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4099 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4100
4101 left_bank_0_reg = vreinterpretq_s8_u16(
4102 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
4103 left_bank_1_reg = vreinterpretq_s8_u16(
4104 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
4105 left_bank_2_reg = vreinterpretq_s8_u16(
4106 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
4107 left_bank_3_reg = vreinterpretq_s8_u16(
4108 vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
4109 left_bank_4_reg = vreinterpretq_s8_u16(
4110 vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
4111 acc0 = adjusted_bias_data;
4112 acc1 = adjusted_bias_data;
4113 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
4114 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
4115 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
4116 vst1_lane_u8x4(output_data_base, acc_u8, 0);
4117 vst1_lane_u8x4(output_data_base + output_height_stride, acc_u8, 1);
4118
4119 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
4120 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
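// At stride 2 the rev/trn sequence advances the input window by two columns,
// exactly one output step, so the second output column of this micro block
// reuses the unshifted filters.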
4121
4122 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4123 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4124 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4125 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4126 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4127 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4128
4129 // Fixed-point multiplication.
4130 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4131 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4132 acc0, -output_shift);
4133 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4134 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4135 acc1, -output_shift);
4136 // Add the output offset.
4137 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4138 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4139 // Apply the activation function.
4140 acc_u8 = vqmovxn_s16(acc_s16_0_1);
4141 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4142 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4143
4144 vst1_lane_u8x4(output_data_base + depth, acc_u8, 0);
4145 vst1_lane_u8x4(output_data_base + depth + output_height_stride,
4146 acc_u8, 1);
4147
4148 left_bank_0_reg = right_bank_0_reg;
4149 left_bank_1_reg = right_bank_1_reg;
4150 left_bank_2_reg = right_bank_2_reg;
4151 left_bank_3_reg = right_bank_3_reg;
4152 left_bank_4_reg = right_bank_4_reg;
4153 }
4154 for (; i_width < output_width_overall_micro_repeats; ++i_width) {
4155 TFLITE_DCHECK_NE(residual_width, kFourOverStride);
4156
4157 // No need to load next ("right") block of data.
4158
4159 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4160 output_data_base = output_data + depth * 2 * i_width + 4 * s;
4161
4162 // Iterate over input width shifts within 4x4 blocks.
4163 {
4164 acc0 = adjusted_bias_data;
4165 acc1 = adjusted_bias_data;
4166
4167 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4168 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4169 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4170 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4171 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4172 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4173
4174 // Fixed-point multiplication.
4175 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4176 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4177 acc0, -output_shift);
4178 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4179 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4180 acc1, -output_shift);
4181 // Add the output offset.
4182 int16x8_t acc_s16_0_1 =
4183 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4184 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4185 // Apply the activation function.
4186 uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4187 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4188 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4189
4190 vst1_lane_u8x4(output_data_base, acc_u8, 0);
4191 vst1_lane_u8x4(output_data_base + output_height_stride, acc_u8,
4192 1);
4193
4194 left_bank_0_reg = vreinterpretq_s8_u16(
4195 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
4196 left_bank_1_reg = vreinterpretq_s8_u16(
4197 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
4198 left_bank_2_reg = vreinterpretq_s8_u16(
4199 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
4200 left_bank_3_reg = vreinterpretq_s8_u16(
4201 vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
4202 left_bank_4_reg = vreinterpretq_s8_u16(
4203 vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
4204 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
4205 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
4206 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
4207 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
4208 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
4209 }
4210 }
4211 bias_data += kBiasIncrement;
4212 }
4213 } else {
4214 // block_height == 1.
4215 int8x16_t filter_reg_0_a;
4216 int8x16_t filter_reg_1_a;
4217 int8x16_t filter_reg_2_a;
4218 int8x16_t filter_reg_0_b;
4219 int8x16_t filter_reg_1_b;
4220 int8x16_t filter_reg_2_b;
4221
4222 filter_reg_0_a = vld1q_s8(filter_block);
4223 filter_reg_1_a = vld1q_s8(filter_block + 32);
4224 filter_reg_2_a = vld1q_s8(filter_block + 64);
4225 filter_reg_0_b = vld1q_s8(filter_block + 16);
4226 filter_reg_1_b = vld1q_s8(filter_block + 16 + 32);
4227 filter_reg_2_b = vld1q_s8(filter_block + 16 + 64);
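            // The filter workspace stores the three filter rows back to back,
            // 32 bytes per row: the first 16 bytes of each row feed depth
            // sub-block "a" and the next 16 bytes feed sub-block "b".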
4228
4229 const int8* scratch_data =
4230 scratch_block_data + depth_micro_stride * j_depth;
4231 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4232 output_data = output_block_data + 8 * j_depth;
4233 const int8* input_data_0 = scratch_data;
4234
4235 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
4236 bias_data += kBiasIncrement;
4237 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
4238 bias_data += kBiasIncrement;
4239
4240 // Load first sub-micro block of data into operational banks.
4241 int8x16_t left_bank_0_reg_a = vld1q_s8(input_data_0);
4242 int8x16_t left_bank_1_reg_a =
4243 vld1q_s8(input_data_0 + workspace_height_stride);
4244 int8x16_t left_bank_2_reg_a =
4245 vld1q_s8(input_data_0 + 2 * workspace_height_stride);
4246 int8x16_t left_bank_0_reg_b = vld1q_s8(input_data_0 + 16);
4247 int8x16_t left_bank_1_reg_b =
4248 vld1q_s8(input_data_0 + workspace_height_stride + 16);
4249 int8x16_t left_bank_2_reg_b =
4250 vld1q_s8(input_data_0 + 2 * workspace_height_stride + 16);
4251
4252 int8x16_t right_bank_0_reg_a;
4253 int8x16_t right_bank_1_reg_a;
4254 int8x16_t right_bank_2_reg_a;
4255 int8x16_t right_bank_0_reg_b;
4256 int8x16_t right_bank_1_reg_b;
4257 int8x16_t right_bank_2_reg_b;
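            // The "left" banks hold the current sub-micro block of input for
            // each of the three filter rows; the "right" banks are pre-loaded
            // with the following sub-micro block so the window can be advanced
            // across the block boundary without another trip to memory.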
4258
4259 int32x4_t acc0_a;
4260 int32x4_t acc0_b;
4261
4262 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
4263 ++i_width) {
4264 const int output_width = i_width == output_width_micro_repeats
4265 ? residual_width
4266 : kFourOverStride;
4267 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
4268 const int8* input_data = input_data_0 + width_micro_stride * i_width;
4269 const bool no_right_block = i_width == output_width_micro_repeats &&
4270 output_width_overall_micro_repeats ==
4271 workspace_width_micro_repeats;
4272
4273 if (!no_right_block) {
4274 // Load next sub-micro block of data.
4275 right_bank_0_reg_a = vld1q_s8(input_data + width_micro_stride);
4276 right_bank_1_reg_a = vld1q_s8(input_data + width_micro_stride +
4277 workspace_height_stride);
4278 right_bank_2_reg_a = vld1q_s8(input_data + width_micro_stride +
4279 2 * workspace_height_stride);
4280 right_bank_0_reg_b = vld1q_s8(input_data + width_micro_stride + 16);
4281 right_bank_1_reg_b = vld1q_s8(input_data + width_micro_stride +
4282 workspace_height_stride + 16);
4283 right_bank_2_reg_b = vld1q_s8(input_data + width_micro_stride +
4284 2 * workspace_height_stride + 16);
4285 }
4286
4287 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4288 output_data_base = output_data + depth * 2 * i_width;
4289
4290 // Iterate over input width shifts within 4x4 blocks.
4291 {
4292 acc0_a = adjusted_bias_data_a;
4293 acc0_b = adjusted_bias_data_b;
4294
4295 acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
4296 acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
4297 acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
4298 acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
4299 acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
4300 acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
4301
4302 // Fixed-point multiplication.
4303 acc0_a = vqrdmulhq_n_s32(acc0_a, output_multiplier);
4304 acc0_b = vqrdmulhq_n_s32(acc0_b, output_multiplier);
4305 acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4306 acc0_a, -output_shift);
4307 acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4308 acc0_b, -output_shift);
4309 // Add the output offset.
4310 int16x8_t acc_s16_0_1 =
4311 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
4312 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4313 // Apply the activation function.
4314 uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4315 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4316 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4317
4318 util_vst1_u8(output_data_base, acc_u8);
4319
4320 left_bank_0_reg_a = vreinterpretq_s8_u16(
4321 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg_a)));
4322 left_bank_1_reg_a = vreinterpretq_s8_u16(
4323 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg_a)));
4324 left_bank_2_reg_a = vreinterpretq_s8_u16(
4325 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg_a)));
4326 left_bank_0_reg_b = vreinterpretq_s8_u16(
4327 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg_b)));
4328 left_bank_1_reg_b = vreinterpretq_s8_u16(
4329 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg_b)));
4330 left_bank_2_reg_b = vreinterpretq_s8_u16(
4331 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg_b)));
4332 vtrn1_s8x2_in_place(&left_bank_0_reg_a, &right_bank_0_reg_a);
4333 vtrn1_s8x2_in_place(&left_bank_1_reg_a, &right_bank_1_reg_a);
4334 vtrn1_s8x2_in_place(&left_bank_2_reg_a, &right_bank_2_reg_a);
4335 vtrn1_s8x2_in_place(&left_bank_0_reg_b, &right_bank_0_reg_b);
4336 vtrn1_s8x2_in_place(&left_bank_1_reg_b, &right_bank_1_reg_b);
4337 vtrn1_s8x2_in_place(&left_bank_2_reg_b, &right_bank_2_reg_b);
4338 }
4339
4340 if (output_width > 1) {
4341 acc0_a = adjusted_bias_data_a;
4342 acc0_b = adjusted_bias_data_b;
4343
4344 acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
4345 acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
4346 acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
4347 acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
4348 acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
4349 acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
4350
4351 // Fixed-point multiplication.
4352 acc0_a = vqrdmulhq_n_s32(acc0_a, output_multiplier);
4353 acc0_b = vqrdmulhq_n_s32(acc0_b, output_multiplier);
4354 acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4355 acc0_a, -output_shift);
4356 acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4357 acc0_b, -output_shift);
4358 // Add the output offset.
4359 int16x8_t acc_s16_0_1 =
4360 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
4361 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4362 // Apply the activation function.
4363 uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4364 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4365 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4366
4367 util_vst1_u8(output_data_base + depth, acc_u8);
4368
4369 left_bank_0_reg_a = right_bank_0_reg_a;
4370 left_bank_1_reg_a = right_bank_1_reg_a;
4371 left_bank_2_reg_a = right_bank_2_reg_a;
4372 left_bank_0_reg_b = right_bank_0_reg_b;
4373 left_bank_1_reg_b = right_bank_1_reg_b;
4374 left_bank_2_reg_b = right_bank_2_reg_b;
4375 }
4376 }
4377 }
4378 }
4379 } // NOLINT(readability/fn_size) Manually unrolled.
4380
4381 static inline void Run(const int8* scratch_block_data,
4382 const int8* filter_workspace, const int32* bias_data,
4383 uint8* output_block_data,
4384 const DepthwiseConvDotProdParams* function_params) {
4385 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
4386 output_block_data, function_params);
4387 }
4388 };
4389
4390 template <>
4391 struct KernelMacroBlock<
4392 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
4393 QuantizationType::kNonPerChannelUint8,
4394 DepthwiseConvDepthMultiplication::kUnitInputDepth,
4395 /*stride=*/1> {
4396 static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
4397 static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
4398 return vmin_u8(a, b);
4399 }
4400 static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
4401 return vmax_u8(a, b);
4402 }
4403 static inline uint8x16_t util_vminq_x8(uint8x16_t a, uint8x16_t b) {
4404 return vminq_u8(a, b);
4405 }
4406 static inline uint8x16_t util_vmaxq_x8(uint8x16_t a, uint8x16_t b) {
4407 return vmaxq_u8(a, b);
4408 }
4409
4410 static inline void KernelMacroBlockIntrinsics(
4411 const int8* scratch_block_data, const int8* filter_workspace,
4412 const int32* bias_data, uint8* output_block_data,
4413 const DepthwiseConvDotProdParams* function_params) {
4414 static constexpr QuantizationType quantization_type =
4415 QuantizationType::kNonPerChannelUint8;
4416
4417 TFLITE_DCHECK_EQ(function_params->stride, 1);
4418 const int workspace_height_stride =
4419 function_params->workspace_height_stride;
4420 const int output_width_micro_repeats =
4421 function_params->output_width_micro_repeats;
4422 const int depth_micro_repeats = function_params->depth_micro_repeats;
4423 const int output_depth = function_params->output_depth;
4424
4425 const int output_width_overall_micro_repeats =
4426 function_params->output_width_overall_micro_repeats;
4427 const int block_height = function_params->outbound_block_height;
4428 const int residual_width = function_params->output_residual_width;
4429 const int output_height_stride = function_params->output_height_stride;
4430 constexpr int kBiasIncrement = 4;
4431
4432 TFLITE_DCHECK(depth_micro_repeats > 0);
4433
4434 const int32 output_activation_min =
4435 function_params->quantized_activation_min;
4436 const int32 output_activation_max =
4437 function_params->quantized_activation_max;
4438 const int32 output_multiplier = function_params->output_multiplier;
4439 const int32 output_shift = function_params->output_shift;
4440 const int32 output_offset = function_params->output_offset;
4441 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
4442 TFLITE_DCHECK_GE(output_activation_min, 0);
4443 TFLITE_DCHECK_LT(output_activation_min, 256);
4444 TFLITE_DCHECK_GE(output_activation_max, 0);
4445 TFLITE_DCHECK_LT(output_activation_max, 256);
4446 } else {
4447 TFLITE_DCHECK_GE(output_activation_min, -128);
4448 TFLITE_DCHECK_LT(output_activation_min, 128);
4449 TFLITE_DCHECK_GE(output_activation_max, -128);
4450 TFLITE_DCHECK_LT(output_activation_max, 128);
4451 }
4452     TFLITE_DCHECK_GE(output_offset, -32768);
4453 TFLITE_DCHECK_LT(output_offset, 32768);
4454
4455 const int16x8_t output_offset_vec =
4456 vdupq_n_s16(static_cast<int16>(output_offset));
4457 const uint8x16_t output_activation_min_vec =
4458 vdupq_n_u8(static_cast<uint8>(output_activation_min));
4459 const uint8x16_t output_activation_max_vec =
4460 vdupq_n_u8(static_cast<uint8>(output_activation_max));
4461
4462 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4463 output_data_depthwise = output_block_data;
4464 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
4465 // Simulate NEON-register transposition of subset of filter.
4466 int8x16_t filter_reg_0_a;
4467 int8x16_t filter_reg_0_b;
4468 int8x16_t filter_reg_1_a;
4469 int8x16_t filter_reg_1_b;
4470 int8x16_t filter_reg_2_a;
4471 int8x16_t filter_reg_2_b;
4472 int8x16_t filter_reg_0_a_shifted;
4473 int8x16_t filter_reg_1_a_shifted;
4474 int8x16_t filter_reg_2_a_shifted;
4475
4476 filter_reg_0_a = vld1q_s8(filter_workspace);
4477 filter_workspace += 16;
4478 filter_reg_0_b = vld1q_s8(filter_workspace);
4479 filter_workspace += 16;
4480 filter_reg_1_a = vld1q_s8(filter_workspace);
4481 filter_workspace += 16;
4482 filter_reg_1_b = vld1q_s8(filter_workspace);
4483 filter_workspace += 16;
4484 filter_reg_2_a = vld1q_s8(filter_workspace);
4485 filter_workspace += 16;
4486 filter_reg_2_b = vld1q_s8(filter_workspace);
4487 filter_workspace += 16;
4488
4489 filter_reg_0_a_shifted = vreinterpretq_s8_u32(
4490 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
4491 filter_reg_1_a_shifted = vreinterpretq_s8_u32(
4492 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
4493 filter_reg_2_a_shifted = vreinterpretq_s8_u32(
4494 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
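          // The *_shifted copies move every filter tap one byte up within its
          // 32-bit lane, so the output column one pixel to the right can be
          // computed from the same input banks. Output columns therefore
          // alternate between the plain and shifted filters, and the input
          // banks only need to be shifted right (by 16 bits, i.e. two pixels)
          // after every second column.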
4495
4496        // When output_width_micro_repeats < output_width_overall_micro_repeats
4497        // and residual_width < 4, the trailing micro block is only partially
4498        // filled and is handled by the narrower residual path below.
4499 const int adjusted_width_micro_repeats =
4500 (output_width_micro_repeats < output_width_overall_micro_repeats) &&
4501 (residual_width < 4)
4502 ? output_width_micro_repeats
4503 : output_width_overall_micro_repeats;
4504
4505 if (block_height == 4) {
4506 for (int s = 0; s < 2; ++s) {
4507 // Work through one slice, by row, at a time.
4508 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4509 output_data_base = output_data_depthwise + 4 * s;
4510
4511 const int8* next_input_data = scratch_block_data;
4512 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4513 output_data = output_data_base;
4514
4515 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
4516 bias_data += kBiasIncrement;
4517
4518 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
4519 int8x16_t input_bank_b_reg; // left 2, right 2, left 3, right 3.
4520 int8x16_t input_bank_c_reg; // left 4, right 4, left 5, right 5.
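            // Each bank is four 32-bit lanes, and each lane holds 4
            // consecutive int8 input values of one row (loaded with
            // vld1q_lane_8x4), i.e. one 4-pixel window at unit input depth.
            // vdotq_four_lane_s32 accumulates dot products of the filter
            // against the 4-byte group selected by its lane argument.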
4521
4522 // Load first sub-micro block of data into operational banks.
4523 input_bank_a_reg =
4524 vld1q_dup_s8x4(next_input_data); // Load lane 0, avoiding
4525 // uninitialized variable.
4526 input_bank_a_reg = vld1q_lane_8x4(
4527 next_input_data + workspace_height_stride, input_bank_a_reg, 2);
4528 input_bank_b_reg = vld1q_dup_s8x4(
4529 next_input_data +
4530 2 * workspace_height_stride); // Load lane 0, avoiding
4531 // uninitialized variable.
4532 input_bank_b_reg =
4533 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4534 input_bank_b_reg, 2);
4535 input_bank_c_reg = vld1q_dup_s8x4(
4536 next_input_data +
4537 4 * workspace_height_stride); // Load lane 0, avoiding
4538 // uninitialized variable.
4539 input_bank_c_reg =
4540 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4541 input_bank_c_reg, 2);
4542
4543 int32x4_t acc0;
4544 int32x4_t acc1;
4545 int32x4_t acc2;
4546 int32x4_t acc3;
4547
4548 acc0 = adjusted_bias_data;
4549 acc1 = adjusted_bias_data;
4550 acc2 = adjusted_bias_data;
4551 acc3 = adjusted_bias_data;
4552
4553 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
4554 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
4555 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
4556 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
4557
4558 int i_width = 0;
4559 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
4560 next_input_data += 4;
4561
4562 // Iterate over input width shifts within 4x4 blocks.
4563 {
4564 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4565 0);
4566 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4567 2);
4568 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4569 2);
4570 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4571 2);
4572 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4573 2);
4574 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4575 0);
4576 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4577 0);
4578 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4579 2);
4580
4581 // Fixed-point multiplication.
4582 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4583 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4584 acc0, -output_shift);
4585 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4586 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4587 acc1, -output_shift);
4588 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4589 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4590 acc2, -output_shift);
4591 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4592 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4593 acc3, -output_shift);
4594 // Add the output offset.
4595 int16x8_t acc_s16_0_1 =
4596 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4597 int16x8_t acc_s16_2_3 =
4598 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4599 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4600 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4601 // Apply the activation function.
4602 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4603 vqmovxn_s16(acc_s16_2_3));
4604 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4605 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4606
4607 vst1q_lane_u8x4(output_data, acc_u8_all, 0);
4608 vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
4609 1);
4610 vst1q_lane_u8x4(output_data + 2 * output_height_stride,
4611 acc_u8_all, 2);
4612 vst1q_lane_u8x4(output_data + 3 * output_height_stride,
4613 acc_u8_all, 3);
4614
4615 output_data += output_depth;
4616 }
4617 // Load next sub-micro block of data.
4618 input_bank_a_reg =
4619 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
4620 input_bank_a_reg = vld1q_lane_8x4(
4621 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
4622 input_bank_b_reg =
4623 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
4624 input_bank_b_reg, 1);
4625 input_bank_b_reg =
4626 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4627 input_bank_b_reg, 3);
4628 input_bank_c_reg =
4629 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
4630 input_bank_c_reg, 1);
4631 input_bank_c_reg =
4632 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4633 input_bank_c_reg, 3);
4634
4635 {
4636 acc0 = adjusted_bias_data;
4637 acc1 = adjusted_bias_data;
4638 acc2 = adjusted_bias_data;
4639 acc3 = adjusted_bias_data;
4640
4641 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
4642 input_bank_a_reg, 0);
4643 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
4644 input_bank_a_reg, 2);
4645 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
4646 input_bank_b_reg, 0);
4647 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
4648 input_bank_a_reg, 2);
4649 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
4650 input_bank_b_reg, 0);
4651 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
4652 input_bank_b_reg, 2);
4653 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
4654 input_bank_b_reg, 0);
4655 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
4656 input_bank_b_reg, 2);
4657 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
4658 input_bank_c_reg, 0);
4659 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
4660 input_bank_b_reg, 2);
4661 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
4662 input_bank_c_reg, 0);
4663 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
4664 input_bank_c_reg, 2);
4665
4666 // Fixed-point multiplication.
4667 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4668 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4669 acc0, -output_shift);
4670 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4671 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4672 acc1, -output_shift);
4673 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4674 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4675 acc2, -output_shift);
4676 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4677 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4678 acc3, -output_shift);
4679 // Add the output offset.
4680 int16x8_t acc_s16_0_1 =
4681 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4682 int16x8_t acc_s16_2_3 =
4683 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4684 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4685 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4686 // Apply the activation function.
4687 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4688 vqmovxn_s16(acc_s16_2_3));
4689 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4690 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4691
4692 vst1q_lane_u8x4(output_data, acc_u8_all, 0);
4693 vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
4694 1);
4695 vst1q_lane_u8x4(output_data + 2 * output_height_stride,
4696 acc_u8_all, 2);
4697 vst1q_lane_u8x4(output_data + 3 * output_height_stride,
4698 acc_u8_all, 3);
4699
4700 input_bank_a_reg = vreinterpretq_s8_u64(
4701 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
4702 input_bank_b_reg = vreinterpretq_s8_u64(
4703 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
4704 input_bank_c_reg = vreinterpretq_s8_u64(
4705 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
4706
4707 output_data += output_depth;
4708 }
4709
4710 {
4711 acc0 = adjusted_bias_data;
4712 acc1 = adjusted_bias_data;
4713 acc2 = adjusted_bias_data;
4714 acc3 = adjusted_bias_data;
4715
4716 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4717 0);
4718 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4719 2);
4720 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4721 0);
4722 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4723 2);
4724 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4725 0);
4726 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4727 2);
4728 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4729 0);
4730 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4731 2);
4732 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4733 0);
4734 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4735 2);
4736 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4737 0);
4738 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4739 2);
4740
4741 // Fixed-point multiplication.
4742 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4743 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4744 acc0, -output_shift);
4745 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4746 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4747 acc1, -output_shift);
4748 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4749 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4750 acc2, -output_shift);
4751 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4752 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4753 acc3, -output_shift);
4754 // Add the output offset.
4755 int16x8_t acc_s16_0_1 =
4756 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4757 int16x8_t acc_s16_2_3 =
4758 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4759 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4760 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4761 // Apply the activation function.
4762 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4763 vqmovxn_s16(acc_s16_2_3));
4764 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4765 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4766
4767 vst1q_lane_u8x4(output_data, acc_u8_all, 0);
4768 vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
4769 1);
4770 vst1q_lane_u8x4(output_data + 2 * output_height_stride,
4771 acc_u8_all, 2);
4772 vst1q_lane_u8x4(output_data + 3 * output_height_stride,
4773 acc_u8_all, 3);
4774
4775 output_data += output_depth;
4776 }
4777
4778 {
4779 acc0 = adjusted_bias_data;
4780 acc1 = adjusted_bias_data;
4781 acc2 = adjusted_bias_data;
4782 acc3 = adjusted_bias_data;
4783
4784 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
4785 input_bank_a_reg, 0);
4786 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
4787 input_bank_a_reg, 2);
4788 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
4789 input_bank_b_reg, 0);
4790 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
4791 input_bank_a_reg, 2);
4792 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
4793 input_bank_b_reg, 0);
4794 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
4795 input_bank_b_reg, 2);
4796 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
4797 input_bank_b_reg, 0);
4798 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
4799 input_bank_b_reg, 2);
4800 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
4801 input_bank_c_reg, 0);
4802 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
4803 input_bank_b_reg, 2);
4804 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
4805 input_bank_c_reg, 0);
4806 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
4807 input_bank_c_reg, 2);
4808
4809 // Fixed-point multiplication.
4810 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4811 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4812 acc0, -output_shift);
4813 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4814 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4815 acc1, -output_shift);
4816 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4817 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4818 acc2, -output_shift);
4819 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4820 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4821 acc3, -output_shift);
4822 // Add the output offset.
4823 int16x8_t acc_s16_0_1 =
4824 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4825 int16x8_t acc_s16_2_3 =
4826 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4827 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4828 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4829 // Apply the activation function.
4830 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4831 vqmovxn_s16(acc_s16_2_3));
4832 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4833 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4834
4835 vst1q_lane_u8x4(output_data, acc_u8_all, 0);
4836 vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
4837 1);
4838 vst1q_lane_u8x4(output_data + 2 * output_height_stride,
4839 acc_u8_all, 2);
4840 vst1q_lane_u8x4(output_data + 3 * output_height_stride,
4841 acc_u8_all, 3);
4842
4843 input_bank_a_reg = vreinterpretq_s8_u64(
4844 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
4845 input_bank_b_reg = vreinterpretq_s8_u64(
4846 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
4847 input_bank_c_reg = vreinterpretq_s8_u64(
4848 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
4849
4850 output_data += output_depth;
4851 acc0 = adjusted_bias_data;
4852 acc1 = adjusted_bias_data;
4853 acc2 = adjusted_bias_data;
4854 acc3 = adjusted_bias_data;
4855
4856 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4857 0);
4858 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4859 0);
4860 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4861 0);
4862 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4863 2);
4864 }
4865 }
4866
4867 if (i_width < output_width_overall_micro_repeats) {
4868 next_input_data += 4;
4869 const int output_width = residual_width;
4870
4871 // Load next sub-micro block of data.
4872 input_bank_a_reg =
4873 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
4874 input_bank_a_reg = vld1q_lane_8x4(
4875 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
4876 input_bank_b_reg =
4877 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
4878 input_bank_b_reg, 1);
4879 input_bank_b_reg =
4880 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4881 input_bank_b_reg, 3);
4882 input_bank_c_reg =
4883 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
4884 input_bank_c_reg, 1);
4885 input_bank_c_reg =
4886 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4887 input_bank_c_reg, 3);
4888
4889 // Iterate over input width shifts within 4x4 blocks.
4890 for (int x = 0; x < output_width; ++x) {
4891 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4892 0);
4893 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4894 2);
4895 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4896 2);
4897 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4898 2);
4899 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4900 2);
4901 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4902 0);
4903 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4904 0);
4905 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4906 2);
4907
4908 // Fixed-point multiplication.
4909 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4910 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4911 acc0, -output_shift);
4912 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4913 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4914 acc1, -output_shift);
4915 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4916 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4917 acc2, -output_shift);
4918 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4919 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4920 acc3, -output_shift);
4921 // Add the output offset.
4922 int16x8_t acc_s16_0_1 =
4923 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4924 int16x8_t acc_s16_2_3 =
4925 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4926 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4927 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4928 // Apply the activation function.
4929 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4930 vqmovxn_s16(acc_s16_2_3));
4931 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4932 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4933
4934 vst1q_lane_u8x4(output_data, acc_u8_all, 0);
4935 vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
4936 1);
4937 vst1q_lane_u8x4(output_data + 2 * output_height_stride,
4938 acc_u8_all, 2);
4939 vst1q_lane_u8x4(output_data + 3 * output_height_stride,
4940 acc_u8_all, 3);
4941
4942 input_bank_a_reg = vreinterpretq_s8_u64(
4943 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 8));
4944 input_bank_b_reg = vreinterpretq_s8_u64(
4945 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 8));
4946 input_bank_c_reg = vreinterpretq_s8_u64(
4947 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 8));
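              // Only the unshifted filters are used on this residual path, so
              // the banks advance by 8 bits (one input pixel) per output
              // column instead of 16 bits per pair of columns.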
4948
4949 output_data += output_depth;
4950
4951 acc0 = adjusted_bias_data;
4952 acc1 = adjusted_bias_data;
4953 acc2 = adjusted_bias_data;
4954 acc3 = adjusted_bias_data;
4955
4956 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4957 0);
4958 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4959 0);
4960 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4961 0);
4962 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4963 2);
4964 }
4965 }
4966 // scratch_block_data += 4 * workspace_height_stride;
4967 output_data_base += 4 * output_height_stride;
4968
4969 // Move to next sub-block: advance to second set of filters, to new
4970 // bias.
4971 filter_reg_0_a = filter_reg_0_b;
4972 filter_reg_1_a = filter_reg_1_b;
4973 filter_reg_2_a = filter_reg_2_b;
4974 filter_reg_0_a_shifted = vreinterpretq_s8_u32(
4975 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
4976 filter_reg_1_a_shifted = vreinterpretq_s8_u32(
4977 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
4978 filter_reg_2_a_shifted = vreinterpretq_s8_u32(
4979 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
4980 }
4981 } else {
4982 // Block height < 4.
4983 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4984 output_data_base = output_data_depthwise;
4985
4986 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
4987 bias_data += kBiasIncrement;
4988 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
4989 bias_data += kBiasIncrement;
4990
4991 for (int k_height = 0; k_height < block_height; ++k_height) {
4992 const int8* next_input_data =
4993 scratch_block_data + k_height * workspace_height_stride;
4994 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4995 output_data = output_data_base;
4996
4997 int8x16_t input_bank_p_reg; // left 0, right 0, left 1, right 1.
4998 int8x16_t input_bank_q_reg; // left 2, right 2, left 3, right 3.
4999
5000 // Load first sub-micro block of data into operational banks.
5001 input_bank_p_reg =
5002 vld1q_dup_s8x4(next_input_data); // Load lane 0, avoiding
5003 // uninitialized variable.
5004 input_bank_p_reg = vld1q_lane_8x4(
5005 next_input_data + workspace_height_stride, input_bank_p_reg, 2);
5006 input_bank_q_reg = vld1q_dup_s8x4(
5007 next_input_data +
5008 2 * workspace_height_stride); // Load lane 0, avoiding
5009 // uninitialized variable.
5010
5011 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
5012 ++i_width) {
5013 next_input_data += 4;
5014 const int output_width =
5015 i_width == output_width_micro_repeats ? residual_width : 4;
5016
5017 // Load next sub-micro block of data.
5018 input_bank_p_reg =
5019 vld1q_lane_8x4(next_input_data, input_bank_p_reg, 1);
5020 input_bank_p_reg = vld1q_lane_8x4(
5021 next_input_data + workspace_height_stride, input_bank_p_reg, 3);
5022 input_bank_q_reg =
5023 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
5024 input_bank_q_reg, 1);
5025 // Iterate over input width shifts within 4x4 blocks.
5026 for (int x = 0; x < output_width; ++x) {
5027 int32x4_t acc_a = adjusted_bias_data_a;
5028 int32x4_t acc_b = adjusted_bias_data_b;
5029 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_0_a,
5030 input_bank_p_reg, 0);
5031 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_1_a,
5032 input_bank_p_reg, 2);
5033 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_2_a,
5034 input_bank_q_reg, 0);
5035 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_0_b,
5036 input_bank_p_reg, 0);
5037 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_1_b,
5038 input_bank_p_reg, 2);
5039 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_2_b,
5040 input_bank_q_reg, 0);
5041
5042 // Fixed-point multiplication.
5043 acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
5044 acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
5045 acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5046 acc_a, -output_shift);
5047 acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5048 acc_b, -output_shift);
5049 // Add the output offset.
5050 int16x8_t acc_s16_0_0 =
5051 vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
5052 acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
5053 // Apply the activation function.
5054 uint8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
5055 acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
5056 vget_low_u8(output_activation_min_vec));
5057 acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
5058 vget_low_u8(output_activation_max_vec));
5059
5060 util_vst1_u8(output_data, acc_u8_0_0);
5061
5062 input_bank_p_reg = vreinterpretq_s8_u64(
5063 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_p_reg), 8));
5064 input_bank_q_reg = vreinterpretq_s8_u64(
5065 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_q_reg), 8));
5066
5067 output_data += output_depth;
5068 }
5069 }
5070 output_data_base += output_height_stride;
5071 }
5072 }
5073 output_data_depthwise += 8;
5074 }
5075 } // NOLINT(readability/fn_size) Manually unrolled.
5076
5077 static inline void Run(const int8* scratch_block_data,
5078 const int8* filter_workspace, const int32* bias_data,
5079 uint8* output_block_data,
5080 const DepthwiseConvDotProdParams* function_params) {
5081 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
5082 output_block_data, function_params);
5083 }
5084 };
5085
5086 template <>
5087 struct KernelMacroBlock<
5088 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
5089 QuantizationType::kNonPerChannelUint8,
5090 DepthwiseConvDepthMultiplication::kUnitInputDepth,
5091 /*stride=*/2> {
5092 static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
5093 static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
5094 return vmin_u8(a, b);
5095 }
5096 static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
5097 return vmax_u8(a, b);
5098 }
5099
5100 static inline void KernelMacroBlockIntrinsics(
5101 const int8* scratch_block_data, const int8* filter_workspace,
5102 const int32* bias_data, uint8* output_block_data,
5103 const DepthwiseConvDotProdParams* function_params) {
5104 static constexpr QuantizationType quantization_type =
5105 QuantizationType::kNonPerChannelUint8;
5106
5107 const int workspace_height_stride =
5108 function_params->workspace_height_stride;
5109 const int output_width_micro_repeats =
5110 function_params->output_width_micro_repeats;
5111 const int depth_micro_repeats = function_params->depth_micro_repeats;
5112 const int output_depth = function_params->output_depth;
5113 constexpr int kStrideVal = 2;
5114 TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
5115
5116 const int output_width_overall_micro_repeats =
5117 function_params->output_width_overall_micro_repeats;
5118 const int block_height = function_params->outbound_block_height;
5119 const int residual_width = function_params->output_residual_width;
5120 const int output_height_stride = function_params->output_height_stride;
5121 constexpr int kBiasIncrement = 4;
5122
5123 const int32 output_activation_min =
5124 function_params->quantized_activation_min;
5125 const int32 output_activation_max =
5126 function_params->quantized_activation_max;
5127 const int32 output_multiplier = function_params->output_multiplier;
5128 const int32 output_shift = function_params->output_shift;
5129 const int32 output_offset = function_params->output_offset;
5130 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
5131 TFLITE_DCHECK_GE(output_activation_min, 0);
5132 TFLITE_DCHECK_LT(output_activation_min, 256);
5133 TFLITE_DCHECK_GE(output_activation_max, 0);
5134 TFLITE_DCHECK_LT(output_activation_max, 256);
5135 } else {
5136 TFLITE_DCHECK_GE(output_activation_min, -128);
5137 TFLITE_DCHECK_LT(output_activation_min, 128);
5138 TFLITE_DCHECK_GE(output_activation_max, -128);
5139 TFLITE_DCHECK_LT(output_activation_max, 128);
5140 }
5141    TFLITE_DCHECK_GE(output_offset, -32768);
5142 TFLITE_DCHECK_LT(output_offset, 32768);
5143
5144 TFLITE_DCHECK_GE(depth_micro_repeats, 1);
5145
5146 const int16x8_t output_offset_vec =
5147 vdupq_n_s16(static_cast<int16>(output_offset));
5148 const uint8x16_t output_activation_min_vec =
5149 vdupq_n_u8(static_cast<uint8>(output_activation_min));
5150 const uint8x16_t output_activation_max_vec =
5151 vdupq_n_u8(static_cast<uint8>(output_activation_max));
5152
5153    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
5154 int8x16_t filter_reg_0_a;
5155 int8x16_t filter_reg_0_b;
5156 int8x16_t filter_reg_1_a;
5157 int8x16_t filter_reg_1_b;
5158 int8x16_t filter_reg_2_a;
5159 int8x16_t filter_reg_2_b;
5160
5161 filter_reg_0_a = vld1q_s8(filter_workspace);
5162 filter_workspace += 16;
5163 filter_reg_0_b = vld1q_s8(filter_workspace);
5164 filter_workspace += 16;
5165 filter_reg_1_a = vld1q_s8(filter_workspace);
5166 filter_workspace += 16;
5167 filter_reg_1_b = vld1q_s8(filter_workspace);
5168 filter_workspace += 16;
5169 filter_reg_2_a = vld1q_s8(filter_workspace);
5170 filter_workspace += 16;
5171 filter_reg_2_b = vld1q_s8(filter_workspace);
5172 filter_workspace += 16;
5173
5174 const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
5175 bias_data += kBiasIncrement;
5176 const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
5177 bias_data += kBiasIncrement;
5178
5179 if (block_height == 2) {
5180 const int8* scratch_data = scratch_block_data;
5181 typename QuantizationTypeImpl<quantization_type>::ExternalType*
5182 output_data = output_block_data + 8 * j_depth;
5183
5184 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
5185 int8x16_t input_bank_b_reg; // left 2, right 2, left 3, right 3.
5186 int8x16_t input_bank_c_reg; // left 4, right 4, xxx, xxx.
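          // The two output rows computed here read input rows 0-2 and 2-4
          // (stride 2), so five input rows are kept live across these three
          // banks (rows 0-1, 2-3, and 4).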
5187
5188 // Load first sub-micro block of data into operational banks.
5189 input_bank_a_reg =
5190 vld1q_dup_s8x4(scratch_data); // Load lane 0, avoiding
5191 // uninitialized variable.
5192 input_bank_a_reg = vld1q_lane_8x4(
5193 scratch_data + workspace_height_stride, input_bank_a_reg, 2);
5194 input_bank_b_reg = vld1q_dup_s8x4(
5195 scratch_data +
5196 2 * workspace_height_stride); // Load lane 0, avoiding
5197 // uninitialized variable.
5198 input_bank_b_reg = vld1q_lane_8x4(
5199 scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
5200 input_bank_c_reg = vld1q_dup_s8x4(
5201 scratch_data +
5202 4 * workspace_height_stride); // Load lane 0, avoiding
5203 // uninitialized variable.
5204
5205 int32x4_t acc0;
5206 int32x4_t acc1;
5207
5208 // When output_width_micro_repeats < output_width_overall_micro_repeats,
5209 // 0 < residual_width <= 2, and so residual_width == 1 is then true iff
5210 // residual_width < 2.
5211 const int adjusted_width_micro_repeats =
5212 (output_width_micro_repeats < output_width_overall_micro_repeats) &&
5213 (residual_width < 2)
5214 ? output_width_micro_repeats
5215 : output_width_overall_micro_repeats;
5216
5217 int i_width = 0;
5218 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
5219 const int8* input_data = scratch_data + 4 + 4 * i_width;
5220
5221 // Load next sub-micro block of data.
5222 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5223 input_bank_a_reg = vld1q_lane_8x4(
5224 input_data + workspace_height_stride, input_bank_a_reg, 3);
5225 input_bank_b_reg = vld1q_lane_8x4(
5226 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5227 input_bank_b_reg = vld1q_lane_8x4(
5228 input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
5229 input_bank_c_reg = vld1q_lane_8x4(
5230 input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
5231
5232 int16x8_t acc_s16_0_1;
5233 uint8x8_t acc_u8_0_1;
5234 // Iterate over input width shifts within 4x4 blocks.
5235 {
5236 acc0 = adjusted_bias_data_s_0;
5237 acc1 = adjusted_bias_data_s_0;
5238
5239 acc0 =
5240 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5241 acc0 =
5242 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5243 acc0 =
5244 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5245 acc1 =
5246 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5247 acc1 =
5248 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5249 acc1 =
5250 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5251
5252 // Fixed-point multiplication.
5253 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5254 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5255 acc0, -output_shift);
5256 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5257 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5258 acc1, -output_shift);
5259 // Add the output offset.
5260 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5261 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5262 // Apply the activation function.
5263 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5264 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5265 vget_low_u8(output_activation_min_vec));
5266 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5267 vget_low_u8(output_activation_max_vec));
5268
5269 vst1_lane_u8x4(output_data, acc_u8_0_1, 0);
5270 vst1_lane_u8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5271
5272 acc0 = adjusted_bias_data_s_1;
5273 acc1 = adjusted_bias_data_s_1;
5274
5275 acc0 =
5276 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5277 acc0 =
5278 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5279 acc0 =
5280 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5281 acc1 =
5282 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5283 acc1 =
5284 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5285 acc1 =
5286 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5287
5288 // Fixed-point multiplication.
5289 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5290 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5291 acc0, -output_shift);
5292 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5293 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5294 acc1, -output_shift);
5295 // Add the output offset.
5296 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5297 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5298 // Apply the activation function.
5299 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5300 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5301 vget_low_u8(output_activation_min_vec));
5302 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5303 vget_low_u8(output_activation_max_vec));
5304
5305 vst1_lane_u8x4(output_data + 4, acc_u8_0_1, 0);
5306 vst1_lane_u8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
5307 1);
5308
5309 input_bank_a_reg = vreinterpretq_s8_u64(
5310 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
5311 input_bank_b_reg = vreinterpretq_s8_u64(
5312 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
5313 input_bank_c_reg = vreinterpretq_s8_u64(
5314 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
5315
5316 output_data += output_depth;
5317 }
5318
5319 // output_width == four_over_stride.
5320 acc0 = adjusted_bias_data_s_0;
5321 acc1 = adjusted_bias_data_s_0;
5322
5323 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5324 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5325 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5326 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5327 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5328 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5329
5330 // Fixed-point multiplication.
5331 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5332 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5333 acc0, -output_shift);
5334 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5335 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5336 acc1, -output_shift);
5337 // Add the output offset.
5338 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5339 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5340 // Apply the activation function.
5341 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5342 acc_u8_0_1 =
5343 util_vmax_x8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
5344 acc_u8_0_1 =
5345 util_vmin_x8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
5346
5347 vst1_lane_u8x4(output_data, acc_u8_0_1, 0);
5348 vst1_lane_u8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5349
5350 acc0 = adjusted_bias_data_s_1;
5351 acc1 = adjusted_bias_data_s_1;
5352
5353 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5354 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5355 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5356 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5357 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5358 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5359
5360 // Fixed-point multiplication.
5361 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5362 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5363 acc0, -output_shift);
5364 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5365 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5366 acc1, -output_shift);
5367 // Add the output offset.
5368 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5369 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5370 // Apply the activation function.
5371 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5372 acc_u8_0_1 =
5373 util_vmax_x8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
5374 acc_u8_0_1 =
5375 util_vmin_x8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
5376
5377 vst1_lane_u8x4(output_data + 4, acc_u8_0_1, 0);
5378 vst1_lane_u8x4(output_data + 4 + output_height_stride, acc_u8_0_1, 1);
5379
5380 input_bank_a_reg = vreinterpretq_s8_u64(
5381 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 8));
5382 input_bank_b_reg = vreinterpretq_s8_u64(
5383 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 8));
5384 input_bank_c_reg = vreinterpretq_s8_u64(
5385 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 8));
5386
5387 output_data += output_depth;
5388 }
5389 for (; i_width < output_width_overall_micro_repeats; ++i_width) {
5390 // output_width == 1.
5391 const int8* input_data = scratch_data + 4 + 4 * i_width;
5392
5393 // Load next sub-micro block of data.
5394 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5395 input_bank_a_reg = vld1q_lane_8x4(
5396 input_data + workspace_height_stride, input_bank_a_reg, 3);
5397 input_bank_b_reg = vld1q_lane_8x4(
5398 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5399 input_bank_b_reg = vld1q_lane_8x4(
5400 input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
5401 input_bank_c_reg = vld1q_lane_8x4(
5402 input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
5403
5404 int16x8_t acc_s16_0_1;
5405 uint8x8_t acc_u8_0_1;
5406 // Iterate over input width shifts within 4x4 blocks.
5407 {
5408 acc0 = adjusted_bias_data_s_0;
5409 acc1 = adjusted_bias_data_s_0;
5410
5411 acc0 =
5412 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5413 acc0 =
5414 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5415 acc0 =
5416 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5417 acc1 =
5418 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5419 acc1 =
5420 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5421 acc1 =
5422 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5423
5424 // Fixed-point multiplication.
5425 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5426 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5427 acc0, -output_shift);
5428 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5429 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5430 acc1, -output_shift);
5431 // Add the output offset.
5432 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5433 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5434 // Apply the activation function.
5435 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5436 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5437 vget_low_u8(output_activation_min_vec));
5438 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5439 vget_low_u8(output_activation_max_vec));
5440
5441 vst1_lane_u8x4(output_data, acc_u8_0_1, 0);
5442 vst1_lane_u8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5443
5444 acc0 = adjusted_bias_data_s_1;
5445 acc1 = adjusted_bias_data_s_1;
5446
5447 acc0 =
5448 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5449 acc0 =
5450 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5451 acc0 =
5452 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5453 acc1 =
5454 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5455 acc1 =
5456 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5457 acc1 =
5458 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5459
5460 // Fixed-point multiplication.
5461 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5462 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5463 acc0, -output_shift);
5464 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5465 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5466 acc1, -output_shift);
5467 // Add the output offset.
5468 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5469 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5470 // Apply the activation function.
5471 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5472 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5473 vget_low_u8(output_activation_min_vec));
5474 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5475 vget_low_u8(output_activation_max_vec));
5476
5477 vst1_lane_u8x4(output_data + 4, acc_u8_0_1, 0);
5478 vst1_lane_u8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
5479 1);
5480
5481 input_bank_a_reg = vreinterpretq_s8_u64(
5482 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
5483 input_bank_b_reg = vreinterpretq_s8_u64(
5484 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
5485 input_bank_c_reg = vreinterpretq_s8_u64(
5486 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
5487
5488 output_data += output_depth;
5489 }
5490 }
5491 } else {
5492 TFLITE_DCHECK_EQ(block_height, 1);
5493 // Work through one slice, by row, at a time.
5494 const int8* scratch_data = scratch_block_data;
5495 typename QuantizationTypeImpl<quantization_type>::ExternalType*
5496 output_data = output_block_data + 8 * j_depth;
5497
5498 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
5499 int8x16_t input_bank_b_reg; // left 2, right 2, xxx, xxx.
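          // A single output row only reads input rows 0-2, so two banks
          // suffice here: rows 0 and 1 in bank a, row 2 in bank b, with the
          // odd lanes holding the next 4-pixel group of the same rows.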
5500
5501 // Load first sub-micro block of data into operational banks.
5502 input_bank_a_reg =
5503 vld1q_dup_s8x4(scratch_data); // Load lane 0, avoiding
5504 // uninitialized variable.
5505 input_bank_a_reg = vld1q_lane_8x4(
5506 scratch_data + workspace_height_stride, input_bank_a_reg, 2);
5507 input_bank_b_reg = vld1q_dup_s8x4(
5508 scratch_data +
5509 2 * workspace_height_stride); // Load lane 0, avoiding
5510 // uninitialized variable.
5511
5512 int32x4_t acc0;
5513 int32x4_t acc1;
5514
5515 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
5516 ++i_width) {
5517 const int output_width =
5518 i_width == output_width_micro_repeats ? residual_width : 2;
5519
5520 TFLITE_DCHECK_LE(output_width, 2);
5521 TFLITE_DCHECK_GE(output_width, 1);
5522 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
5523 const int8* input_data = scratch_data + 4 + 4 * i_width;
5524
5525 // Load next sub-micro block of data.
5526 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5527 input_bank_a_reg = vld1q_lane_8x4(
5528 input_data + workspace_height_stride, input_bank_a_reg, 3);
5529 input_bank_b_reg = vld1q_lane_8x4(
5530 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5531
5532 int16x8_t acc_s16_0_1;
5533 uint8x8_t acc_u8_0_1;
5534
5535 // Iterate over input width shifts within 4x4 blocks.
5536 {
5537 acc0 = adjusted_bias_data_s_0;
5538
5539 acc0 =
5540 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5541 acc0 =
5542 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5543 acc0 =
5544 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5545
5546 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5547 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5548 acc0, -output_shift);
5549
5550 // Second sub-block accumulation.
5551 acc1 = adjusted_bias_data_s_1;
5552
5553 acc1 =
5554 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
5555 acc1 =
5556 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
5557 acc1 =
5558 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
5559
5560 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5561 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5562 acc1, -output_shift);
5563
5564 // Add the output offset.
5565 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5566 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5567 // Apply the activation function.
5568 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5569 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5570 vget_low_u8(output_activation_min_vec));
5571 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5572 vget_low_u8(output_activation_max_vec));
5573
5574 // This stores the results for both sub-blocks together.
5575 util_vst1_u8(output_data, acc_u8_0_1);
5576
5577 input_bank_a_reg = vreinterpretq_s8_u64(
5578 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
5579 input_bank_b_reg = vreinterpretq_s8_u64(
5580 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
5581
5582 output_data += output_depth;
5583 }
5584 if (output_width == 2) {
5585 acc0 = adjusted_bias_data_s_0;
5586
5587 acc0 =
5588 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5589 acc0 =
5590 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5591 acc0 =
5592 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5593
5594 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5595 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5596 acc0, -output_shift);
5597
5598 // Second sub-block accumulation.
5599 acc1 = adjusted_bias_data_s_1;
5600
5601 acc1 =
5602 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
5603 acc1 =
5604 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
5605 acc1 =
5606 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
5607
5608 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5609 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5610 acc1, -output_shift);
5611
5612 // Add the output offset.
5613 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5614 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5615 // Apply the activation function.
5616 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5617 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5618 vget_low_u8(output_activation_min_vec));
5619 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5620 vget_low_u8(output_activation_max_vec));
5621
5622 // This stores the results for both sub-blocks together.
5623 util_vst1_u8(output_data, acc_u8_0_1);
5624
5625 input_bank_a_reg = vreinterpretq_s8_u64(
5626 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
5627 input_bank_b_reg = vreinterpretq_s8_u64(
5628 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
5629
5630 output_data += output_depth;
5631 }
5632 }
5633 }
5634 }
5635 }
5636
5637 static inline void Run(const int8* scratch_block_data,
5638 const int8* filter_workspace, const int32* bias_data,
5639 uint8* output_block_data,
5640 const DepthwiseConvDotProdParams* function_params) {
5641 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
5642 output_block_data, function_params);
5643 }
5644 };
5645
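// 3x3 depthwise convolution, per-channel int8 quantization, no depth
// multiplication, stride 1. Requantization uses per-channel multipliers and
// shifts taken from the function parameters.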
5646 template <>
5647 struct KernelMacroBlock<
5648 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
5649 QuantizationType::kPerChannelInt8,
5650 DepthwiseConvDepthMultiplication::kNoMultiplication,
5651 /*stride=*/1> {
5652 static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
5653 static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
5654 return vmin_s8(a, b);
5655 }
5656 static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
5657 return vmax_s8(a, b);
5658 }
5659 static inline int8x16_t util_vminq_x8(int8x16_t a, int8x16_t b) {
5660 return vminq_s8(a, b);
5661 }
5662 static inline int8x16_t util_vmaxq_x8(int8x16_t a, int8x16_t b) {
5663 return vmaxq_s8(a, b);
5664 }
5665
5666 static inline void KernelMacroBlockIntrinsics(
5667 const int8* scratch_block_data, const int8* filter_workspace,
5668 const int32* bias_data, int8* output_block_data,
5669 const DepthwiseConvDotProdParams* function_params) {
5670 static constexpr QuantizationType quantization_type =
5671 QuantizationType::kPerChannelInt8;
5672
5673 const int workspace_height_stride =
5674 function_params->workspace_height_stride;
5675 const int input_width_overall_micro_repeats =
5676 function_params->input_width_overall_micro_repeats;
5677 const int output_width_micro_repeats =
5678 function_params->output_width_micro_repeats;
5679 const int depth_micro_repeats = function_params->depth_micro_repeats;
5680 const int depth = function_params->input_depth;
5681
5682 const int output_width_overall_micro_repeats =
5683 function_params->output_width_overall_micro_repeats;
5684 const int block_height = function_params->outbound_block_height;
5685 const int residual_width = function_params->output_residual_width;
5686 const int output_height_stride = function_params->output_height_stride;
5687 constexpr int kBiasIncrement = 4;
5688
5689 TFLITE_DCHECK(depth_micro_repeats > 0);
5690 const int width_micro_stride = 4 * 8;
5691 const int depth_micro_stride =
5692 width_micro_stride * input_width_overall_micro_repeats;
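// Scratch-workspace layout note (inferred from the strides above): each width
// micro block holds 4 input positions x 8 depth channels (32 bytes); a depth
// micro block spans all width micro blocks of the row group.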
5693
5694 const int32 output_activation_min =
5695 function_params->quantized_activation_min;
5696 const int32 output_activation_max =
5697 function_params->quantized_activation_max;
5698 const int32 output_offset = function_params->output_offset;
5699 const int32* output_shift_per_channel =
5700 function_params->output_shift_per_channel;
5701 const int32* output_multiplier_per_channel =
5702 function_params->output_multiplier_per_channel;
5703 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
5704 TFLITE_DCHECK_GE(output_activation_min, 0);
5705 TFLITE_DCHECK_LT(output_activation_min, 256);
5706 TFLITE_DCHECK_GE(output_activation_max, 0);
5707 TFLITE_DCHECK_LT(output_activation_max, 256);
5708 } else {
5709 TFLITE_DCHECK_GE(output_activation_min, -128);
5710 TFLITE_DCHECK_LT(output_activation_min, 128);
5711 TFLITE_DCHECK_GE(output_activation_max, -128);
5712 TFLITE_DCHECK_LT(output_activation_max, 128);
5713 TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
5714 TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
5715 }
5716    TFLITE_DCHECK_GE(output_offset, -32768);
5717 TFLITE_DCHECK_LT(output_offset, 32768);
5718
5719 const int16x8_t output_offset_vec =
5720 vdupq_n_s16(static_cast<int16>(output_offset));
5721 const int8x16_t output_activation_min_vec =
5722 vdupq_n_s8(static_cast<int8>(output_activation_min));
5723 const int8x16_t output_activation_max_vec =
5724 vdupq_n_s8(static_cast<int8>(output_activation_max));
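// Per lane, the requantization applied below is, roughly: a saturating
// rounding doubling high multiply by the per-channel multiplier
// (vqrdmulhq_s32), a rounding right shift by the per-channel exponent
// (DivideByPOT), a saturating add of the output offset, then narrowing and
// clamping to the quantized activation range.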
5725
5726 const int8* input_data_depthwise = scratch_block_data;
5727 typename QuantizationTypeImpl<quantization_type>::ExternalType*
5728 output_data_depthwise = output_block_data;
5729 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
5730 // Simulate NEON-register transposition of subset of filter.
5731 int8x16_t filter_reg_0_a;
5732 int8x16_t filter_reg_0_b;
5733 int8x16_t filter_reg_1_a;
5734 int8x16_t filter_reg_1_b;
5735 int8x16_t filter_reg_2_a;
5736 int8x16_t filter_reg_2_b;
5737 int8x16_t filter_reg_0_a_shifted;
5738 int8x16_t filter_reg_1_a_shifted;
5739 int8x16_t filter_reg_2_a_shifted;
5740
5741 filter_reg_0_a = vld1q_s8(filter_workspace);
5742 filter_workspace += 16;
5743 filter_reg_0_b = vld1q_s8(filter_workspace);
5744 filter_workspace += 16;
5745 filter_reg_1_a = vld1q_s8(filter_workspace);
5746 filter_workspace += 16;
5747 filter_reg_1_b = vld1q_s8(filter_workspace);
5748 filter_workspace += 16;
5749 filter_reg_2_a = vld1q_s8(filter_workspace);
5750 filter_workspace += 16;
5751 filter_reg_2_b = vld1q_s8(filter_workspace);
5752 filter_workspace += 16;
5753
5754 filter_reg_0_a_shifted = vreinterpretq_s8_u32(
5755 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
5756 filter_reg_1_a_shifted = vreinterpretq_s8_u32(
5757 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
5758 filter_reg_2_a_shifted = vreinterpretq_s8_u32(
5759 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
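// The "_shifted" filter copies (each 32-bit lane shifted up one byte) are
// dotted against the same input banks to produce the next output column, so
// the banks only need to be advanced every second column.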
5760
5761 if (block_height == 4) {
5762 for (int s = 0; s < 2; ++s) {
5763 // Work through one slice, by row, at a time.
5764 const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
5765 typename QuantizationTypeImpl<quantization_type>::ExternalType*
5766 output_data_base = output_data_depthwise + 4 * s;
5767
5768 const int8* next_input_data = input_data_base;
5769 typename QuantizationTypeImpl<quantization_type>::ExternalType*
5770 output_data = output_data_base;
5771
5772 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
5773 bias_data += kBiasIncrement;
5774
5775 const int32x4_t output_shift =
5776 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
5777 const int32x4_t output_multiplier =
5778 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
5779
5780 // Load first sub-micro block of data into operational banks.
5781 int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
5782 int8x16_t left_bank_1_reg =
5783 vld1q_s8(next_input_data + workspace_height_stride);
5784 int8x16_t left_bank_2_reg =
5785 vld1q_s8(next_input_data + 2 * workspace_height_stride);
5786 int8x16_t left_bank_3_reg =
5787 vld1q_s8(next_input_data + 3 * workspace_height_stride);
5788 int8x16_t left_bank_4_reg =
5789 vld1q_s8(next_input_data + 4 * workspace_height_stride);
5790 int8x16_t left_bank_5_reg =
5791 vld1q_s8(next_input_data + 5 * workspace_height_stride);
5792
5793 int32x4_t acc0;
5794 int32x4_t acc1;
5795 int32x4_t acc2;
5796 int32x4_t acc3;
5797
5798 acc0 = adjusted_bias_data;
5799 acc1 = adjusted_bias_data;
5800 acc2 = adjusted_bias_data;
5801 acc3 = adjusted_bias_data;
5802
5803 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
5804 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
5805 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
5806 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
5807
5808 for (int i_width = 0; i_width < output_width_micro_repeats;
5809 ++i_width) {
5810 next_input_data += width_micro_stride;
5811
5812 // Iterate over input width shifts within 4x4 blocks.
5813 {
5814 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
5815 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
5816 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
5817 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
5818 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
5819 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
5820 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
5821 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
5822
5823 // Fixed-point multiplication.
5824 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5825 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5826 acc0, output_shift);
5827 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5828 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5829 acc1, output_shift);
5830 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5831 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5832 acc2, output_shift);
5833 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5834 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5835 acc3, output_shift);
5836 // Add the output offset.
5837 int16x8_t acc_s16_0_1 =
5838 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5839 int16x8_t acc_s16_2_3 =
5840 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5841 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5842 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5843 // Apply the activation function.
5844 int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
5845 vqmovxn_s16(acc_s16_2_3));
5846 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5847 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5848
5849 vst1q_lane_s8x4(output_data, acc_u8_all, 0);
5850 vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
5851 1);
5852 vst1q_lane_s8x4(output_data + 2 * output_height_stride,
5853 acc_u8_all, 2);
5854 vst1q_lane_s8x4(output_data + 3 * output_height_stride,
5855 acc_u8_all, 3);
5856
5857 output_data += depth;
5858 }
5859
5860 // Load next sub-micro block of data.
5861 int8x16_t right_bank_0_reg;
5862 int8x16_t right_bank_1_reg;
5863 int8x16_t right_bank_2_reg;
5864 int8x16_t right_bank_3_reg;
5865 int8x16_t right_bank_4_reg;
5866 int8x16_t right_bank_5_reg;
5867
5868 // Loading of next block always valid.
5869 right_bank_0_reg = vld1q_s8(next_input_data);
5870 right_bank_1_reg =
5871 vld1q_s8(next_input_data + workspace_height_stride);
5872 right_bank_2_reg =
5873 vld1q_s8(next_input_data + 2 * workspace_height_stride);
5874 right_bank_3_reg =
5875 vld1q_s8(next_input_data + 3 * workspace_height_stride);
5876 right_bank_4_reg =
5877 vld1q_s8(next_input_data + 4 * workspace_height_stride);
5878 right_bank_5_reg =
5879 vld1q_s8(next_input_data + 5 * workspace_height_stride);
5880
5881 {
5882 acc0 = adjusted_bias_data;
5883 acc1 = adjusted_bias_data;
5884 acc2 = adjusted_bias_data;
5885 acc3 = adjusted_bias_data;
5886
5887 acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
5888 acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
5889 acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
5890 acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
5891 acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
5892 acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
5893 acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
5894 acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
5895 acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
5896 acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
5897 acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
5898 acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
5899
5900 // Fixed-point multiplication.
5901 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5902 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5903 acc0, output_shift);
5904 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5905 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5906 acc1, output_shift);
5907 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5908 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5909 acc2, output_shift);
5910 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5911 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5912 acc3, output_shift);
5913 // Add the output offset.
5914 int16x8_t acc_s16_0_1 =
5915 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5916 int16x8_t acc_s16_2_3 =
5917 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5918 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5919 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5920 // Apply the activation function.
5921 int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
5922 vqmovxn_s16(acc_s16_2_3));
5923 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5924 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5925
5926 vst1q_lane_s8x4(output_data, acc_u8_all, 0);
5927 vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
5928 1);
5929 vst1q_lane_s8x4(output_data + 2 * output_height_stride,
5930 acc_u8_all, 2);
5931 vst1q_lane_s8x4(output_data + 3 * output_height_stride,
5932 acc_u8_all, 3);
5933
5934 left_bank_0_reg = vreinterpretq_s8_u16(
5935 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
5936 left_bank_1_reg = vreinterpretq_s8_u16(
5937 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
5938 left_bank_2_reg = vreinterpretq_s8_u16(
5939 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
5940 left_bank_3_reg = vreinterpretq_s8_u16(
5941 vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
5942 left_bank_4_reg = vreinterpretq_s8_u16(
5943 vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
5944 left_bank_5_reg = vreinterpretq_s8_u16(
5945 vrev32q_u16(vreinterpretq_u16_s8(left_bank_5_reg)));
5946 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
5947 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
5948 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
5949 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
5950 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
5951 vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
5952
5953 output_data += depth;
5954 }
5955
5956 {
5957 acc0 = adjusted_bias_data;
5958 acc1 = adjusted_bias_data;
5959 acc2 = adjusted_bias_data;
5960 acc3 = adjusted_bias_data;
5961
5962 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
5963 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
5964 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
5965 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
5966 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
5967 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
5968 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
5969 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
5970 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
5971 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
5972 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
5973 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
5974
5975 // Fixed-point multiplication.
5976 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5977 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5978 acc0, output_shift);
5979 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5980 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5981 acc1, output_shift);
5982 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5983 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5984 acc2, output_shift);
5985 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5986 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5987 acc3, output_shift);
5988 // Add the output offset.
5989 int16x8_t acc_s16_0_1 =
5990 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5991 int16x8_t acc_s16_2_3 =
5992 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5993 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5994 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5995 // Apply the activation function.
5996 int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
5997 vqmovxn_s16(acc_s16_2_3));
5998 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5999 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
6000
6001 vst1q_lane_s8x4(output_data, acc_u8_all, 0);
6002 vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
6003 1);
6004 vst1q_lane_s8x4(output_data + 2 * output_height_stride,
6005 acc_u8_all, 2);
6006 vst1q_lane_s8x4(output_data + 3 * output_height_stride,
6007 acc_u8_all, 3);
6008
6009 output_data += depth;
6010 }
6011
6012 {
6013 acc0 = adjusted_bias_data;
6014 acc1 = adjusted_bias_data;
6015 acc2 = adjusted_bias_data;
6016 acc3 = adjusted_bias_data;
6017
6018 acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
6019 acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
6020 acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
6021 acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
6022 acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
6023 acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
6024 acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
6025 acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
6026 acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
6027 acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
6028 acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
6029 acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
6030
6031 // Fixed-point multiplication.
6032 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6033 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6034 acc0, output_shift);
6035 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6036 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6037 acc1, output_shift);
6038 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
6039 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6040 acc2, output_shift);
6041 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
6042 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6043 acc3, output_shift);
6044 // Add the output offset.
6045 int16x8_t acc_s16_0_1 =
6046 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6047 int16x8_t acc_s16_2_3 =
6048 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
6049 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6050 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
6051 // Apply the activation function.
6052 int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
6053 vqmovxn_s16(acc_s16_2_3));
6054 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
6055 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
6056
6057 vst1q_lane_s8x4(output_data, acc_u8_all, 0);
6058 vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
6059 1);
6060 vst1q_lane_s8x4(output_data + 2 * output_height_stride,
6061 acc_u8_all, 2);
6062 vst1q_lane_s8x4(output_data + 3 * output_height_stride,
6063 acc_u8_all, 3);
6064
6065 left_bank_0_reg = right_bank_0_reg;
6066 left_bank_1_reg = right_bank_1_reg;
6067 left_bank_2_reg = right_bank_2_reg;
6068 left_bank_3_reg = right_bank_3_reg;
6069 left_bank_4_reg = right_bank_4_reg;
6070 left_bank_5_reg = right_bank_5_reg;
6071
6072 output_data += depth;
6073 acc0 = adjusted_bias_data;
6074 acc1 = adjusted_bias_data;
6075 acc2 = adjusted_bias_data;
6076 acc3 = adjusted_bias_data;
6077
6078 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6079 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
6080 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
6081 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
6082 }
6083 }
6084
6085 if (residual_width > 0) {
6086 next_input_data += width_micro_stride;
6087 const int output_width = residual_width;
6088
6089 // Load next sub-micro block of data.
6090 int8x16_t right_bank_0_reg;
6091 int8x16_t right_bank_1_reg;
6092 int8x16_t right_bank_2_reg;
6093 int8x16_t right_bank_3_reg;
6094 int8x16_t right_bank_4_reg;
6095 int8x16_t right_bank_5_reg;
6096 // Logic: (output_width - 1) * stride_val < 2.
6097 const bool no_right_block = output_width < 3;
6098
6099 if (no_right_block) {
6100 // Only needed for sanitizer checks.
6101 right_bank_0_reg = vdupq_n_s8(0);
6102 right_bank_1_reg = vdupq_n_s8(0);
6103 right_bank_2_reg = vdupq_n_s8(0);
6104 right_bank_3_reg = vdupq_n_s8(0);
6105 right_bank_4_reg = vdupq_n_s8(0);
6106 right_bank_5_reg = vdupq_n_s8(0);
6107 } else {
6108 right_bank_0_reg = vld1q_s8(next_input_data);
6109 right_bank_1_reg =
6110 vld1q_s8(next_input_data + workspace_height_stride);
6111 right_bank_2_reg =
6112 vld1q_s8(next_input_data + 2 * workspace_height_stride);
6113 right_bank_3_reg =
6114 vld1q_s8(next_input_data + 3 * workspace_height_stride);
6115 right_bank_4_reg =
6116 vld1q_s8(next_input_data + 4 * workspace_height_stride);
6117 right_bank_5_reg =
6118 vld1q_s8(next_input_data + 5 * workspace_height_stride);
6119 }
6120
6121 // Iterate over input width shifts within 4x4 blocks.
6122 for (int x = 0; x < output_width; ++x) {
6123 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6124 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6125 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
6126 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
6127 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
6128 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
6129 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
6130 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
6131
6132 // Fixed-point multiplication.
6133 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6134 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6135 acc0, output_shift);
6136 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6137 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6138 acc1, output_shift);
6139 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
6140 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6141 acc2, output_shift);
6142 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
6143 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6144 acc3, output_shift);
6145 // Add the output offset.
6146 int16x8_t acc_s16_0_1 =
6147 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6148 int16x8_t acc_s16_2_3 =
6149 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
6150 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6151 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
6152 // Apply the activation function.
6153 int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
6154 vqmovxn_s16(acc_s16_2_3));
6155 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
6156 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
6157
6158 vst1q_lane_s8x4(output_data, acc_u8_all, 0);
6159 vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
6160 1);
6161 vst1q_lane_s8x4(output_data + 2 * output_height_stride,
6162 acc_u8_all, 2);
6163 vst1q_lane_s8x4(output_data + 3 * output_height_stride,
6164 acc_u8_all, 3);
6165
6166 biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
6167 biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
6168 biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
6169 biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
6170 biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
6171 biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
6172
6173 output_data += depth;
6174
6175 acc0 = adjusted_bias_data;
6176 acc1 = adjusted_bias_data;
6177 acc2 = adjusted_bias_data;
6178 acc3 = adjusted_bias_data;
6179
6180 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6181 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
6182 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
6183 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
6184 }
6185 }
6186 input_data_base += 4 * workspace_height_stride;
6187 output_data_base += 4 * output_height_stride;
6188
6189 // Move to next sub-block: advance to second set of filters, to new
6190 // bias.
6191 filter_reg_0_a = filter_reg_0_b;
6192 filter_reg_1_a = filter_reg_1_b;
6193 filter_reg_2_a = filter_reg_2_b;
6194 filter_reg_0_a_shifted = vreinterpretq_s8_u32(
6195 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
6196 filter_reg_1_a_shifted = vreinterpretq_s8_u32(
6197 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
6198 filter_reg_2_a_shifted = vreinterpretq_s8_u32(
6199 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
6200 }
6201 } else {
6202 const int8* input_data_base = input_data_depthwise;
6203 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6204 output_data_base = output_data_depthwise;
6205
6206 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
6207 bias_data += kBiasIncrement;
6208 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
6209 bias_data += kBiasIncrement;
6210
6211 const int32x4_t output_shift_a =
6212 vld1q_s32(output_shift_per_channel + j_depth * 8);
6213 const int32x4_t output_multiplier_a =
6214 vld1q_s32(output_multiplier_per_channel + j_depth * 8);
6215 const int32x4_t output_shift_b =
6216 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
6217 const int32x4_t output_multiplier_b =
6218 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
6219
6220 for (int k_height = 0; k_height < block_height; ++k_height) {
6221 const int8* next_input_data = input_data_base;
6222 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6223 output_data = output_data_base;
6224
6225 // Load first sub-micro block of data into operational banks.
6226 int8x16_t left_bank_0_reg_a = vld1q_s8(next_input_data);
6227 int8x16_t left_bank_1_reg_a =
6228 vld1q_s8(next_input_data + workspace_height_stride);
6229 int8x16_t left_bank_2_reg_a =
6230 vld1q_s8(next_input_data + 2 * workspace_height_stride);
6231 int8x16_t left_bank_0_reg_b = vld1q_s8(next_input_data + 16);
6232 int8x16_t left_bank_1_reg_b =
6233 vld1q_s8(next_input_data + workspace_height_stride + 16);
6234 int8x16_t left_bank_2_reg_b =
6235 vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
6236
6237 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
6238 ++i_width) {
6239 next_input_data += width_micro_stride;
6240 const int output_width =
6241 i_width == output_width_micro_repeats ? residual_width : 4;
6242
6243 int8x16_t right_bank_0_reg_a;
6244 int8x16_t right_bank_1_reg_a;
6245 int8x16_t right_bank_2_reg_a;
6246 int8x16_t right_bank_0_reg_b;
6247 int8x16_t right_bank_1_reg_b;
6248 int8x16_t right_bank_2_reg_b;
6249 // Logic: (output_width - 1) * stride_val < 2.
6250 const bool no_right_block = output_width < 3;
6251
6252 // Load next sub-micro block of data.
6253 if (no_right_block) {
6254 // Only needed for sanitizer checks.
6255 right_bank_0_reg_a = vdupq_n_s8(0);
6256 right_bank_1_reg_a = vdupq_n_s8(0);
6257 right_bank_2_reg_a = vdupq_n_s8(0);
6258 right_bank_0_reg_b = vdupq_n_s8(0);
6259 right_bank_1_reg_b = vdupq_n_s8(0);
6260 right_bank_2_reg_b = vdupq_n_s8(0);
6261 } else {
6262 right_bank_0_reg_a = vld1q_s8(next_input_data);
6263 right_bank_1_reg_a =
6264 vld1q_s8(next_input_data + workspace_height_stride);
6265 right_bank_2_reg_a =
6266 vld1q_s8(next_input_data + 2 * workspace_height_stride);
6267 right_bank_0_reg_b = vld1q_s8(next_input_data + 16);
6268 right_bank_1_reg_b =
6269 vld1q_s8(next_input_data + workspace_height_stride + 16);
6270 right_bank_2_reg_b =
6271 vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
6272 }
6273
6274 // Iterate over input width shifts within 4x4 blocks.
6275 for (int x = 0; x < output_width; ++x) {
6276 int32x4_t acc_a = adjusted_bias_data_a;
6277 int32x4_t acc_b = adjusted_bias_data_b;
6278 acc_a = vdotq_s32(acc_a, filter_reg_0_a, left_bank_0_reg_a);
6279 acc_a = vdotq_s32(acc_a, filter_reg_1_a, left_bank_1_reg_a);
6280 acc_a = vdotq_s32(acc_a, filter_reg_2_a, left_bank_2_reg_a);
6281 acc_b = vdotq_s32(acc_b, filter_reg_0_b, left_bank_0_reg_b);
6282 acc_b = vdotq_s32(acc_b, filter_reg_1_b, left_bank_1_reg_b);
6283 acc_b = vdotq_s32(acc_b, filter_reg_2_b, left_bank_2_reg_b);
6284
6285 // Fixed-point multiplication.
6286 acc_a = vqrdmulhq_s32(acc_a, output_multiplier_a);
6287 acc_b = vqrdmulhq_s32(acc_b, output_multiplier_b);
6288 acc_a =
6289 DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6290 acc_a, output_shift_a);
6291 acc_b =
6292 DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6293 acc_b, output_shift_b);
6294 // Add the output offset.
6295 int16x8_t acc_s16_0_0 =
6296 vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
6297 acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
6298 // Apply the activation function.
6299 int8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
6300 acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
6301 vget_low_s8(output_activation_min_vec));
6302 acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
6303 vget_low_s8(output_activation_max_vec));
6304
6305 vst1_s8(output_data, acc_u8_0_0);
6306
6307 biregister_rotate_8(&left_bank_0_reg_a, &right_bank_0_reg_a);
6308 biregister_rotate_8(&left_bank_1_reg_a, &right_bank_1_reg_a);
6309 biregister_rotate_8(&left_bank_2_reg_a, &right_bank_2_reg_a);
6310 biregister_rotate_8(&left_bank_0_reg_b, &right_bank_0_reg_b);
6311 biregister_rotate_8(&left_bank_1_reg_b, &right_bank_1_reg_b);
6312 biregister_rotate_8(&left_bank_2_reg_b, &right_bank_2_reg_b);
6313
6314 output_data += depth;
6315 }
6316 }
6317 input_data_base += workspace_height_stride;
6318 output_data_base += output_height_stride;
6319 }
6320 }
6321 input_data_depthwise += depth_micro_stride;
6322 output_data_depthwise += 8;
6323 }
6324 } // NOLINT(readability/fn_size) Manually unrolled.
6325
6326 static inline void Run(const int8* scratch_block_data,
6327 const int8* filter_workspace, const int32* bias_data,
6328 int8* output_block_data,
6329 const DepthwiseConvDotProdParams* function_params) {
6330 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
6331 output_block_data, function_params);
6332 }
6333 };
6334
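// 3x3 depthwise convolution, per-channel int8 quantization, no depth
// multiplication, stride 2.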
6335 template <>
6336 struct KernelMacroBlock<
6337 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
6338 QuantizationType::kPerChannelInt8,
6339 DepthwiseConvDepthMultiplication::kNoMultiplication,
6340 /*stride=*/2> {
6341 static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
6342 static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
6343 return vmin_s8(a, b);
6344 }
6345 static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
6346 return vmax_s8(a, b);
6347 }
6348
6349 static inline void KernelMacroBlockIntrinsics(
6350 const int8* scratch_block_data, const int8* filter_workspace,
6351 const int32* bias_data, int8* output_block_data,
6352 const DepthwiseConvDotProdParams* function_params) {
6353 static constexpr QuantizationType quantization_type =
6354 QuantizationType::kPerChannelInt8;
6355
6356 const int workspace_height_stride =
6357 function_params->workspace_height_stride;
6358 const int input_width_overall_micro_repeats =
6359 function_params->input_width_overall_micro_repeats;
6360 const int output_width_micro_repeats =
6361 function_params->output_width_micro_repeats;
6362 const int depth_micro_repeats = function_params->depth_micro_repeats;
6363 const int depth = function_params->input_depth;
6364 constexpr int kStrideVal = 2;
6365 constexpr int kFourOverStride = 2;
6366 TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
6367 TFLITE_DCHECK_EQ(function_params->four_over_stride, kFourOverStride);
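// With a stride of 2, each 4-wide input micro block yields at most
// kFourOverStride = 2 output columns.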
6368
6369 const int workspace_width_micro_repeats =
6370 function_params->workspace_width_micro_repeats;
6371 const int output_width_overall_micro_repeats =
6372 function_params->output_width_overall_micro_repeats;
6373 const int block_height = function_params->outbound_block_height;
6374 const int residual_width = function_params->output_residual_width;
6375 const int output_height_stride = function_params->output_height_stride;
6376 constexpr int kBiasIncrement = 4;
6377
6378 TFLITE_DCHECK(depth_micro_repeats > 0);
6379 const int width_micro_stride = 4 * 8;
6380 const int depth_micro_stride =
6381 width_micro_stride * input_width_overall_micro_repeats;
6382
6383 const int32 output_activation_min =
6384 function_params->quantized_activation_min;
6385 const int32 output_activation_max =
6386 function_params->quantized_activation_max;
6387 const int32 output_offset = function_params->output_offset;
6388 const int32* output_shift_per_channel =
6389 function_params->output_shift_per_channel;
6390 const int32* output_multiplier_per_channel =
6391 function_params->output_multiplier_per_channel;
6392 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
6393 TFLITE_DCHECK_GE(output_activation_min, 0);
6394 TFLITE_DCHECK_LT(output_activation_min, 256);
6395 TFLITE_DCHECK_GE(output_activation_max, 0);
6396 TFLITE_DCHECK_LT(output_activation_max, 256);
6397 } else {
6398 TFLITE_DCHECK_GE(output_activation_min, -128);
6399 TFLITE_DCHECK_LT(output_activation_min, 128);
6400 TFLITE_DCHECK_GE(output_activation_max, -128);
6401 TFLITE_DCHECK_LT(output_activation_max, 128);
6402 TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
6403 TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
6404 }
6405    TFLITE_DCHECK_GE(output_offset, -32768);
6406 TFLITE_DCHECK_LT(output_offset, 32768);
6407
6408 // This version only does min/max on 64 bits.
6409 const int16x8_t output_offset_vec =
6410 vdupq_n_s16(static_cast<int16>(output_offset));
6411 const int8x8_t output_activation_min_vec =
6412 vdup_n_s8(static_cast<int8>(output_activation_min));
6413 const int8x8_t output_activation_max_vec =
6414 vdup_n_s8(static_cast<int8>(output_activation_max));
6415
6416 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
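// 96 bytes per depth micro block: 3 filter rows x 2 register sub-blocks
// (a and b) x 16 bytes each.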
6417
6418 TFLITE_DCHECK_LE(block_height, 2);
6419
6420 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
6421 const int8* filter_block =
6422 filter_workspace + shuffled_filter_increment * j_depth;
6423
6424 if (block_height == 2) {
6425 for (int s = 0; s < 2; ++s) {
6426 // Simulate NEON-register transposition of subset of filter.
6427 int8x16_t filter_reg_0_a;
6428 int8x16_t filter_reg_1_a;
6429 int8x16_t filter_reg_2_a;
6430
6431 filter_reg_0_a = vld1q_s8(filter_block + s * 16);
6432 filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
6433 filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
6434
6435 const int8* scratch_data =
6436 scratch_block_data + depth_micro_stride * j_depth;
6437 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6438 output_data = output_block_data + 8 * j_depth;
6439 const int8* input_data_0 = scratch_data + s * 2 * 8;
6440
6441 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
6442
6443 const int32x4_t output_shift =
6444 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
6445 const int32x4_t output_multiplier =
6446 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
6447
6448 // Load first sub-micro block of data into operational banks.
6449 int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
6450 int8x16_t left_bank_1_reg =
6451 vld1q_s8(input_data_0 + workspace_height_stride);
6452 int8x16_t left_bank_2_reg =
6453 vld1q_s8(input_data_0 + 2 * workspace_height_stride);
6454 int8x16_t left_bank_3_reg =
6455 vld1q_s8(input_data_0 + 3 * workspace_height_stride);
6456 int8x16_t left_bank_4_reg =
6457 vld1q_s8(input_data_0 + 4 * workspace_height_stride);
6458
6459 int8x16_t right_bank_0_reg;
6460 int8x16_t right_bank_1_reg;
6461 int8x16_t right_bank_2_reg;
6462 int8x16_t right_bank_3_reg;
6463 int8x16_t right_bank_4_reg;
6464
6465 int32x4_t acc0;
6466 int32x4_t acc1;
6467 int16x8_t acc_s16_0_1;
6468 int8x8_t acc_u8;
6469
6470 int i_width = 0;
6471
6472 // When output_width_micro_repeats <
6473 // output_width_overall_micro_repeats, 0 < residual_width <= 2, and so
6474 // residual_width == 1 is then true iff residual_width < 2.
6475 const int adjusted_width_micro_repeats =
6476 (output_width_micro_repeats <
6477 output_width_overall_micro_repeats) &&
6478 (residual_width == 1)
6479 ? output_width_micro_repeats
6480 : output_width_overall_micro_repeats;
6481
6482 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
6483 const int output_width = kFourOverStride;
6484 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
6485 const int8* input_data =
6486 input_data_0 + width_micro_stride * i_width;
6487 acc0 = adjusted_bias_data;
6488 acc1 = adjusted_bias_data;
6489 right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
6490 right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
6491 workspace_height_stride);
6492
6493 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6494 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6495 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6496 output_data_base = output_data + depth * 2 * i_width + 4 * s;
6497
6498 right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
6499 2 * workspace_height_stride);
6500 right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
6501 3 * workspace_height_stride);
6502 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6503 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6504 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6505 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6506 right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
6507 4 * workspace_height_stride);
6508
6509 // Fixed-point multiplication.
6510 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6511 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6512 acc0, output_shift);
6513 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6514 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6515 acc1, output_shift);
6516 // Add the output offset.
6517 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6518 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6519 // Apply the activation function.
6520 acc_u8 = vqmovxn_s16(acc_s16_0_1);
6521 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6522 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6523
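// Effectively advance the operating banks by two input columns (one stride-2
// step): swap the 16-bit column pairs within each bank, then merge in the
// corresponding columns from the right-hand bank.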
6524 left_bank_0_reg = vreinterpretq_s8_u16(
6525 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
6526 left_bank_1_reg = vreinterpretq_s8_u16(
6527 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
6528 left_bank_2_reg = vreinterpretq_s8_u16(
6529 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
6530 left_bank_3_reg = vreinterpretq_s8_u16(
6531 vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
6532 left_bank_4_reg = vreinterpretq_s8_u16(
6533 vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
6534 acc0 = adjusted_bias_data;
6535 acc1 = adjusted_bias_data;
6536 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
6537 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
6538 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
6539 vst1_lane_s8x4(output_data_base, acc_u8, 0);
6540 vst1_lane_s8x4(output_data_base + output_height_stride, acc_u8, 1);
6541
6542 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
6543 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
6544
6545 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6546 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6547 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6548 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6549 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6550 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6551
6552 // Fixed-point multiplication.
6553 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6554 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6555 acc0, output_shift);
6556 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6557 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6558 acc1, output_shift);
6559 // Add the output offset.
6560 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6561 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6562 // Apply the activation function.
6563 acc_u8 = vqmovxn_s16(acc_s16_0_1);
6564 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6565 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6566
6567 vst1_lane_s8x4(output_data_base + depth, acc_u8, 0);
6568 vst1_lane_s8x4(output_data_base + depth + output_height_stride,
6569 acc_u8, 1);
6570
6571 left_bank_0_reg = right_bank_0_reg;
6572 left_bank_1_reg = right_bank_1_reg;
6573 left_bank_2_reg = right_bank_2_reg;
6574 left_bank_3_reg = right_bank_3_reg;
6575 left_bank_4_reg = right_bank_4_reg;
6576 }
6577 for (; i_width < output_width_overall_micro_repeats; ++i_width) {
6578 TFLITE_DCHECK_NE(residual_width, kFourOverStride);
6579
6580 // No need to load next ("right") block of data.
6581
6582 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6583 output_data_base = output_data + depth * 2 * i_width + 4 * s;
6584
6585 // Iterate over input width shifts within 4x4 blocks.
6586 {
6587 acc0 = adjusted_bias_data;
6588 acc1 = adjusted_bias_data;
6589
6590 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6591 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6592 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6593 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6594 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6595 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6596
6597 // Fixed-point multiplication.
6598 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6599 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6600 acc0, output_shift);
6601 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6602 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6603 acc1, output_shift);
6604 // Add the output offset.
6605 int16x8_t acc_s16_0_1 =
6606 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6607 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6608 // Apply the activation function.
6609 int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6610 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6611 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6612
6613 vst1_lane_s8x4(output_data_base, acc_u8, 0);
6614 vst1_lane_s8x4(output_data_base + output_height_stride, acc_u8,
6615 1);
6616
6617 left_bank_0_reg = vreinterpretq_s8_u16(
6618 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg)));
6619 left_bank_1_reg = vreinterpretq_s8_u16(
6620 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg)));
6621 left_bank_2_reg = vreinterpretq_s8_u16(
6622 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg)));
6623 left_bank_3_reg = vreinterpretq_s8_u16(
6624 vrev32q_u16(vreinterpretq_u16_s8(left_bank_3_reg)));
6625 left_bank_4_reg = vreinterpretq_s8_u16(
6626 vrev32q_u16(vreinterpretq_u16_s8(left_bank_4_reg)));
6627 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
6628 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
6629 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
6630 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
6631 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
6632 }
6633 }
6634 bias_data += kBiasIncrement;
6635 }
6636 } else {
6637 // block_height == 1.
6638 int8x16_t filter_reg_0_a;
6639 int8x16_t filter_reg_1_a;
6640 int8x16_t filter_reg_2_a;
6641 int8x16_t filter_reg_0_b;
6642 int8x16_t filter_reg_1_b;
6643 int8x16_t filter_reg_2_b;
6644
6645 filter_reg_0_a = vld1q_s8(filter_block);
6646 filter_reg_1_a = vld1q_s8(filter_block + 32);
6647 filter_reg_2_a = vld1q_s8(filter_block + 64);
6648 filter_reg_0_b = vld1q_s8(filter_block + 16);
6649 filter_reg_1_b = vld1q_s8(filter_block + 16 + 32);
6650 filter_reg_2_b = vld1q_s8(filter_block + 16 + 64);
6651
6652 const int8* scratch_data =
6653 scratch_block_data + depth_micro_stride * j_depth;
6654 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6655 output_data = output_block_data + 8 * j_depth;
6656 const int8* input_data_0 = scratch_data;
6657
6658 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
6659 bias_data += kBiasIncrement;
6660 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
6661 bias_data += kBiasIncrement;
6662
6663 const int32x4_t output_shift_a =
6664 vld1q_s32(output_shift_per_channel + j_depth * 8);
6665 const int32x4_t output_multiplier_a =
6666 vld1q_s32(output_multiplier_per_channel + j_depth * 8);
6667 const int32x4_t output_shift_b =
6668 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
6669 const int32x4_t output_multiplier_b =
6670 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
6671
6672 // Load first sub-micro block of data into operational banks.
6673 int8x16_t left_bank_0_reg_a = vld1q_s8(input_data_0);
6674 int8x16_t left_bank_1_reg_a =
6675 vld1q_s8(input_data_0 + workspace_height_stride);
6676 int8x16_t left_bank_2_reg_a =
6677 vld1q_s8(input_data_0 + 2 * workspace_height_stride);
6678 int8x16_t left_bank_0_reg_b = vld1q_s8(input_data_0 + 16);
6679 int8x16_t left_bank_1_reg_b =
6680 vld1q_s8(input_data_0 + workspace_height_stride + 16);
6681 int8x16_t left_bank_2_reg_b =
6682 vld1q_s8(input_data_0 + 2 * workspace_height_stride + 16);
6683
6684 int8x16_t right_bank_0_reg_a;
6685 int8x16_t right_bank_1_reg_a;
6686 int8x16_t right_bank_2_reg_a;
6687 int8x16_t right_bank_0_reg_b;
6688 int8x16_t right_bank_1_reg_b;
6689 int8x16_t right_bank_2_reg_b;
6690
6691 int32x4_t acc0_a;
6692 int32x4_t acc0_b;
6693
6694 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
6695 ++i_width) {
6696 const int output_width = i_width == output_width_micro_repeats
6697 ? residual_width
6698 : kFourOverStride;
6699 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
6700 const int8* input_data = input_data_0 + width_micro_stride * i_width;
6701 const bool no_right_block = i_width == output_width_micro_repeats &&
6702 output_width_overall_micro_repeats ==
6703 workspace_width_micro_repeats;
6704
6705 if (!no_right_block) {
6706 // Load next sub-micro block of data.
6707 right_bank_0_reg_a = vld1q_s8(input_data + width_micro_stride);
6708 right_bank_1_reg_a = vld1q_s8(input_data + width_micro_stride +
6709 workspace_height_stride);
6710 right_bank_2_reg_a = vld1q_s8(input_data + width_micro_stride +
6711 2 * workspace_height_stride);
6712 right_bank_0_reg_b = vld1q_s8(input_data + width_micro_stride + 16);
6713 right_bank_1_reg_b = vld1q_s8(input_data + width_micro_stride +
6714 workspace_height_stride + 16);
6715 right_bank_2_reg_b = vld1q_s8(input_data + width_micro_stride +
6716 2 * workspace_height_stride + 16);
6717 }
6718
6719 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6720 output_data_base = output_data + depth * 2 * i_width;
6721
6722 // Iterate over input width shifts within 4x4 blocks.
6723 {
6724 acc0_a = adjusted_bias_data_a;
6725 acc0_b = adjusted_bias_data_b;
6726
6727 acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
6728 acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
6729 acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
6730 acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
6731 acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
6732 acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
6733
6734 // Fixed-point multiplication.
6735 acc0_a = vqrdmulhq_s32(acc0_a, output_multiplier_a);
6736 acc0_b = vqrdmulhq_s32(acc0_b, output_multiplier_b);
6737 acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6738 acc0_a, output_shift_a);
6739 acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6740 acc0_b, output_shift_b);
6741 // Add the output offset.
6742 int16x8_t acc_s16_0_1 =
6743 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
6744 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6745 // Apply the activation function.
6746 int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6747 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6748 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6749
6750 vst1_s8(output_data_base, acc_u8);
6751
6752 left_bank_0_reg_a = vreinterpretq_s8_u16(
6753 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg_a)));
6754 left_bank_1_reg_a = vreinterpretq_s8_u16(
6755 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg_a)));
6756 left_bank_2_reg_a = vreinterpretq_s8_u16(
6757 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg_a)));
6758 left_bank_0_reg_b = vreinterpretq_s8_u16(
6759 vrev32q_u16(vreinterpretq_u16_s8(left_bank_0_reg_b)));
6760 left_bank_1_reg_b = vreinterpretq_s8_u16(
6761 vrev32q_u16(vreinterpretq_u16_s8(left_bank_1_reg_b)));
6762 left_bank_2_reg_b = vreinterpretq_s8_u16(
6763 vrev32q_u16(vreinterpretq_u16_s8(left_bank_2_reg_b)));
6764 vtrn1_s8x2_in_place(&left_bank_0_reg_a, &right_bank_0_reg_a);
6765 vtrn1_s8x2_in_place(&left_bank_1_reg_a, &right_bank_1_reg_a);
6766 vtrn1_s8x2_in_place(&left_bank_2_reg_a, &right_bank_2_reg_a);
6767 vtrn1_s8x2_in_place(&left_bank_0_reg_b, &right_bank_0_reg_b);
6768 vtrn1_s8x2_in_place(&left_bank_1_reg_b, &right_bank_1_reg_b);
6769 vtrn1_s8x2_in_place(&left_bank_2_reg_b, &right_bank_2_reg_b);
6770 }
6771
6772 if (output_width > 1) {
6773 acc0_a = adjusted_bias_data_a;
6774 acc0_b = adjusted_bias_data_b;
6775
6776 acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
6777 acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
6778 acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
6779 acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
6780 acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
6781 acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
6782
6783 // Fixed-point multiplication.
6784 acc0_a = vqrdmulhq_s32(acc0_a, output_multiplier_a);
6785 acc0_b = vqrdmulhq_s32(acc0_b, output_multiplier_b);
6786 acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6787 acc0_a, output_shift_a);
6788 acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6789 acc0_b, output_shift_b);
6790 // Add the output offset.
6791 int16x8_t acc_s16_0_1 =
6792 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
6793 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6794 // Apply the activation function.
6795 int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6796 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6797 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6798
6799 vst1_s8(output_data_base + depth, acc_u8);
6800
6801 left_bank_0_reg_a = right_bank_0_reg_a;
6802 left_bank_1_reg_a = right_bank_1_reg_a;
6803 left_bank_2_reg_a = right_bank_2_reg_a;
6804 left_bank_0_reg_b = right_bank_0_reg_b;
6805 left_bank_1_reg_b = right_bank_1_reg_b;
6806 left_bank_2_reg_b = right_bank_2_reg_b;
6807 }
6808 }
6809 }
6810 }
6811 } // NOLINT(readability/fn_size) Manually unrolled.
6812
6813 static inline void Run(const int8* scratch_block_data,
6814 const int8* filter_workspace, const int32* bias_data,
6815 int8* output_block_data,
6816 const DepthwiseConvDotProdParams* function_params) {
6817 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
6818 output_block_data, function_params);
6819 }
6820 };
6821
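// 3x3 depthwise convolution, per-channel int8 quantization, depth
// multiplication from unit input depth, stride 1.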
6822 template <>
6823 struct KernelMacroBlock<
6824 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
6825 QuantizationType::kPerChannelInt8,
6826 DepthwiseConvDepthMultiplication::kUnitInputDepth,
6827 /*stride=*/1> {
6828 static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
6829 static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
6830 return vmin_s8(a, b);
6831 }
6832 static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
6833 return vmax_s8(a, b);
6834 }
6835 static inline int8x16_t util_vminq_x8(int8x16_t a, int8x16_t b) {
6836 return vminq_s8(a, b);
6837 }
6838 static inline int8x16_t util_vmaxq_x8(int8x16_t a, int8x16_t b) {
6839 return vmaxq_s8(a, b);
6840 }
6841
6842 static inline void KernelMacroBlockIntrinsics(
6843 const int8* scratch_block_data, const int8* filter_workspace,
6844 const int32* bias_data, int8* output_block_data,
6845 const DepthwiseConvDotProdParams* function_params) {
6846 static constexpr QuantizationType quantization_type =
6847 QuantizationType::kPerChannelInt8;
6848
6849 TFLITE_DCHECK_EQ(function_params->stride, 1);
6850 const int workspace_height_stride =
6851 function_params->workspace_height_stride;
6852 const int output_width_micro_repeats =
6853 function_params->output_width_micro_repeats;
6854 const int depth_micro_repeats = function_params->depth_micro_repeats;
6855 const int output_depth = function_params->output_depth;
6856
6857 const int output_width_overall_micro_repeats =
6858 function_params->output_width_overall_micro_repeats;
6859 const int block_height = function_params->outbound_block_height;
6860 const int residual_width = function_params->output_residual_width;
6861 const int output_height_stride = function_params->output_height_stride;
6862 constexpr int kBiasIncrement = 4;
6863
6864 TFLITE_DCHECK(depth_micro_repeats > 0);
6865
6866 const int32 output_activation_min =
6867 function_params->quantized_activation_min;
6868 const int32 output_activation_max =
6869 function_params->quantized_activation_max;
6870 const int32 output_offset = function_params->output_offset;
6871 const int32* output_shift_per_channel =
6872 function_params->output_shift_per_channel;
6873 const int32* output_multiplier_per_channel =
6874 function_params->output_multiplier_per_channel;
6875 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
6876 TFLITE_DCHECK_GE(output_activation_min, 0);
6877 TFLITE_DCHECK_LT(output_activation_min, 256);
6878 TFLITE_DCHECK_GE(output_activation_max, 0);
6879 TFLITE_DCHECK_LT(output_activation_max, 256);
6880 } else {
6881 TFLITE_DCHECK_GE(output_activation_min, -128);
6882 TFLITE_DCHECK_LT(output_activation_min, 128);
6883 TFLITE_DCHECK_GE(output_activation_max, -128);
6884 TFLITE_DCHECK_LT(output_activation_max, 128);
6885 TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
6886 TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
6887 }
6888 TFLITE_DCHECK_GE(output_offset, -32768);
6889 TFLITE_DCHECK_LT(output_offset, 32768);
6890
6891 const int16x8_t output_offset_vec =
6892 vdupq_n_s16(static_cast<int16>(output_offset));
6893 const int8x16_t output_activation_min_vec =
6894 vdupq_n_s8(static_cast<int8>(output_activation_min));
6895 const int8x16_t output_activation_max_vec =
6896 vdupq_n_s8(static_cast<int8>(output_activation_max));
6897
6898 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6899 output_data_depthwise = output_block_data;
6900 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
6901 // Simulate NEON-register transposition of a subset of the filter.
6902 int8x16_t filter_reg_0_a;
6903 int8x16_t filter_reg_0_b;
6904 int8x16_t filter_reg_1_a;
6905 int8x16_t filter_reg_1_b;
6906 int8x16_t filter_reg_2_a;
6907 int8x16_t filter_reg_2_b;
6908 int8x16_t filter_reg_0_a_shifted;
6909 int8x16_t filter_reg_1_a_shifted;
6910 int8x16_t filter_reg_2_a_shifted;
6911
6912 filter_reg_0_a = vld1q_s8(filter_workspace);
6913 filter_workspace += 16;
6914 filter_reg_0_b = vld1q_s8(filter_workspace);
6915 filter_workspace += 16;
6916 filter_reg_1_a = vld1q_s8(filter_workspace);
6917 filter_workspace += 16;
6918 filter_reg_1_b = vld1q_s8(filter_workspace);
6919 filter_workspace += 16;
6920 filter_reg_2_a = vld1q_s8(filter_workspace);
6921 filter_workspace += 16;
6922 filter_reg_2_b = vld1q_s8(filter_workspace);
6923 filter_workspace += 16;
6924
6925 filter_reg_0_a_shifted = vreinterpretq_s8_u32(
6926 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
6927 filter_reg_1_a_shifted = vreinterpretq_s8_u32(
6928 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
6929 filter_reg_2_a_shifted = vreinterpretq_s8_u32(
6930 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
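// Maintenance note: the "_shifted" registers hold the same taps moved up one
// byte within every 32-bit lane. Assuming the filter workspace packs the three
// taps of a filter row into the low three bytes of each lane (fourth byte
// zero), a dot product of a shifted register against an unchanged input lane
// evaluates the filter one input column to the right, so two horizontally
// adjacent outputs can be produced from a single loaded input lane.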
6931
6932 // When output_width_micro_repeats < output_width_overall_micro_repeats,
6933 // the trailing micro block is narrower than four columns
6934 // (residual_width < 4) and is handled by the residual path further below.
6935 const int adjusted_width_micro_repeats =
6936 (output_width_micro_repeats < output_width_overall_micro_repeats) &&
6937 (residual_width < 4)
6938 ? output_width_micro_repeats
6939 : output_width_overall_micro_repeats;
6940
6941 if (block_height == 4) {
6942 for (int s = 0; s < 2; ++s) {
6943 // Work through one slice, by row, at a time.
6944 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6945 output_data_base = output_data_depthwise + 4 * s;
6946
6947 const int8* next_input_data = scratch_block_data;
6948 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6949 output_data = output_data_base;
6950
6951 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
6952 bias_data += kBiasIncrement;
6953
6954 const int32x4_t output_shift =
6955 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
6956 const int32x4_t output_multiplier =
6957 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
6958
6959 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
6960 int8x16_t input_bank_b_reg; // left 2, right 2, left 3, right 3.
6961 int8x16_t input_bank_c_reg; // left 4, right 4, left 5, right 5.
6962
6963 // Load first sub-micro block of data into operational banks.
6964 input_bank_a_reg =
6965 vld1q_dup_s8x4(next_input_data); // Load lane 0, avoiding
6966 // uninitialized variable.
6967 input_bank_a_reg = vld1q_lane_8x4(
6968 next_input_data + workspace_height_stride, input_bank_a_reg, 2);
6969 input_bank_b_reg = vld1q_dup_s8x4(
6970 next_input_data +
6971 2 * workspace_height_stride); // Load lane 0, avoiding
6972 // uninitialized variable.
6973 input_bank_b_reg =
6974 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
6975 input_bank_b_reg, 2);
6976 input_bank_c_reg = vld1q_dup_s8x4(
6977 next_input_data +
6978 4 * workspace_height_stride); // Load lane 0, avoiding
6979 // uninitialized variable.
6980 input_bank_c_reg =
6981 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
6982 input_bank_c_reg, 2);
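// The three banks together hold six consecutive input rows: each 64-bit half
// of a bank is one row, split into a "left" 4-pixel lane and a "right" 4-pixel
// lane. At this point only the left lanes hold their intended data; the right
// lanes contain placeholder copies from the dup loads and are overwritten at
// the top of the width loop when the next sub-micro block is loaded.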
6983
6984 int32x4_t acc0;
6985 int32x4_t acc1;
6986 int32x4_t acc2;
6987 int32x4_t acc3;
6988
6989 acc0 = adjusted_bias_data;
6990 acc1 = adjusted_bias_data;
6991 acc2 = adjusted_bias_data;
6992 acc3 = adjusted_bias_data;
6993
6994 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
6995 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
6996 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
6997 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
6998
6999 int i_width = 0;
7000 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
7001 next_input_data += 4;
7002
7003 // Iterate over input width shifts within 4x4 blocks.
7004 {
7005 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
7006 0);
7007 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
7008 2);
7009 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
7010 2);
7011 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
7012 2);
7013 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
7014 2);
7015 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
7016 0);
7017 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
7018 0);
7019 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
7020 2);
7021
7022 // Fixed-point multiplication.
7023 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7024 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7025 acc0, output_shift);
7026 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7027 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7028 acc1, output_shift);
7029 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7030 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7031 acc2, output_shift);
7032 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7033 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7034 acc3, output_shift);
7035 // Add the output offset.
7036 int16x8_t acc_s16_0_1 =
7037 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7038 int16x8_t acc_s16_2_3 =
7039 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7040 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7041 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7042 // Apply the activation function.
7043 int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
7044 vqmovxn_s16(acc_s16_2_3));
7045 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7046 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7047
7048 vst1q_lane_s8x4(output_data, acc_u8_all, 0);
7049 vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
7050 1);
7051 vst1q_lane_s8x4(output_data + 2 * output_height_stride,
7052 acc_u8_all, 2);
7053 vst1q_lane_s8x4(output_data + 3 * output_height_stride,
7054 acc_u8_all, 3);
7055
7056 output_data += output_depth;
7057 }
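// Maintenance sketch (comment only, not compiled): per output channel c, the
// requantization sequence repeated throughout this function is roughly
//   v = SaturatingRoundingDoublingHighMul(acc[c], output_multiplier[c]);
//   v = RoundingShift(v, output_shift[c]);        // DivideByPOT<kUpward>
//   v = SaturatingNarrowToInt16(v) + output_offset;  // vqmovn_s32 + vqaddq_s16
//   v = SaturatingNarrowToInt8(v);                    // vqmovxn_s16
//   v = Clamp(v, output_activation_min, output_activation_max);
// The helper names above are descriptive placeholders, not functions defined
// in this file; the exact shift-sign convention is whatever
// DivideByPOT<kUpward>::RunMult implements.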
7058 // Load next sub-micro block of data.
7059 input_bank_a_reg =
7060 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
7061 input_bank_a_reg = vld1q_lane_8x4(
7062 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
7063 input_bank_b_reg =
7064 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
7065 input_bank_b_reg, 1);
7066 input_bank_b_reg =
7067 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
7068 input_bank_b_reg, 3);
7069 input_bank_c_reg =
7070 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
7071 input_bank_c_reg, 1);
7072 input_bank_c_reg =
7073 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
7074 input_bank_c_reg, 3);
7075
7076 {
7077 acc0 = adjusted_bias_data;
7078 acc1 = adjusted_bias_data;
7079 acc2 = adjusted_bias_data;
7080 acc3 = adjusted_bias_data;
7081
7082 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
7083 input_bank_a_reg, 0);
7084 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
7085 input_bank_a_reg, 2);
7086 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
7087 input_bank_b_reg, 0);
7088 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
7089 input_bank_a_reg, 2);
7090 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
7091 input_bank_b_reg, 0);
7092 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
7093 input_bank_b_reg, 2);
7094 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
7095 input_bank_b_reg, 0);
7096 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
7097 input_bank_b_reg, 2);
7098 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
7099 input_bank_c_reg, 0);
7100 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
7101 input_bank_b_reg, 2);
7102 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
7103 input_bank_c_reg, 0);
7104 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
7105 input_bank_c_reg, 2);
7106
7107 // Fixed-point multiplication.
7108 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7109 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7110 acc0, output_shift);
7111 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7112 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7113 acc1, output_shift);
7114 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7115 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7116 acc2, output_shift);
7117 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7118 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7119 acc3, output_shift);
7120 // Add the output offset.
7121 int16x8_t acc_s16_0_1 =
7122 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7123 int16x8_t acc_s16_2_3 =
7124 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7125 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7126 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7127 // Apply the activation function.
7128 int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
7129 vqmovxn_s16(acc_s16_2_3));
7130 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7131 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7132
7133 vst1q_lane_s8x4(output_data, acc_u8_all, 0);
7134 vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
7135 1);
7136 vst1q_lane_s8x4(output_data + 2 * output_height_stride,
7137 acc_u8_all, 2);
7138 vst1q_lane_s8x4(output_data + 3 * output_height_stride,
7139 acc_u8_all, 3);
7140
7141 input_bank_a_reg = vreinterpretq_s8_u64(
7142 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
7143 input_bank_b_reg = vreinterpretq_s8_u64(
7144 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
7145 input_bank_c_reg = vreinterpretq_s8_u64(
7146 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
7147
7148 output_data += output_depth;
7149 }
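// The banks have just been shifted right by 16 bits, discarding the two oldest
// pixels of every row; the next plain and byte-shifted filter passes therefore
// produce output columns 2 and 3 of this four-wide micro block.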
7150
7151 {
7152 acc0 = adjusted_bias_data;
7153 acc1 = adjusted_bias_data;
7154 acc2 = adjusted_bias_data;
7155 acc3 = adjusted_bias_data;
7156
7157 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
7158 0);
7159 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
7160 2);
7161 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7162 0);
7163 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
7164 2);
7165 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7166 0);
7167 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
7168 2);
7169 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7170 0);
7171 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
7172 2);
7173 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
7174 0);
7175 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7176 2);
7177 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
7178 0);
7179 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
7180 2);
7181
7182 // Fixed-point multiplication.
7183 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7184 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7185 acc0, output_shift);
7186 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7187 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7188 acc1, output_shift);
7189 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7190 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7191 acc2, output_shift);
7192 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7193 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7194 acc3, output_shift);
7195 // Add the output offset.
7196 int16x8_t acc_s16_0_1 =
7197 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7198 int16x8_t acc_s16_2_3 =
7199 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7200 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7201 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7202 // Apply the activation function.
7203 int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
7204 vqmovxn_s16(acc_s16_2_3));
7205 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7206 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7207
7208 vst1q_lane_s8x4(output_data, acc_u8_all, 0);
7209 vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
7210 1);
7211 vst1q_lane_s8x4(output_data + 2 * output_height_stride,
7212 acc_u8_all, 2);
7213 vst1q_lane_s8x4(output_data + 3 * output_height_stride,
7214 acc_u8_all, 3);
7215
7216 output_data += output_depth;
7217 }
7218
7219 {
7220 acc0 = adjusted_bias_data;
7221 acc1 = adjusted_bias_data;
7222 acc2 = adjusted_bias_data;
7223 acc3 = adjusted_bias_data;
7224
7225 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
7226 input_bank_a_reg, 0);
7227 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
7228 input_bank_a_reg, 2);
7229 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
7230 input_bank_b_reg, 0);
7231 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
7232 input_bank_a_reg, 2);
7233 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
7234 input_bank_b_reg, 0);
7235 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
7236 input_bank_b_reg, 2);
7237 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
7238 input_bank_b_reg, 0);
7239 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
7240 input_bank_b_reg, 2);
7241 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
7242 input_bank_c_reg, 0);
7243 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
7244 input_bank_b_reg, 2);
7245 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
7246 input_bank_c_reg, 0);
7247 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
7248 input_bank_c_reg, 2);
7249
7250 // Fixed-point multiplication.
7251 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7252 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7253 acc0, output_shift);
7254 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7255 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7256 acc1, output_shift);
7257 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7258 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7259 acc2, output_shift);
7260 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7261 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7262 acc3, output_shift);
7263 // Add the output offset.
7264 int16x8_t acc_s16_0_1 =
7265 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7266 int16x8_t acc_s16_2_3 =
7267 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7268 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7269 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7270 // Apply the activation function.
7271 int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
7272 vqmovxn_s16(acc_s16_2_3));
7273 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7274 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7275
7276 vst1q_lane_s8x4(output_data, acc_u8_all, 0);
7277 vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
7278 1);
7279 vst1q_lane_s8x4(output_data + 2 * output_height_stride,
7280 acc_u8_all, 2);
7281 vst1q_lane_s8x4(output_data + 3 * output_height_stride,
7282 acc_u8_all, 3);
7283
7284 input_bank_a_reg = vreinterpretq_s8_u64(
7285 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
7286 input_bank_b_reg = vreinterpretq_s8_u64(
7287 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
7288 input_bank_c_reg = vreinterpretq_s8_u64(
7289 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
7290
7291 output_data += output_depth;
7292 acc0 = adjusted_bias_data;
7293 acc1 = adjusted_bias_data;
7294 acc2 = adjusted_bias_data;
7295 acc3 = adjusted_bias_data;
7296
7297 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7298 0);
7299 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7300 0);
7301 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7302 0);
7303 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7304 2);
7305 }
7306 }
7307
7308 if (i_width < output_width_overall_micro_repeats) {
7309 next_input_data += 4;
7310 const int output_width = residual_width;
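// Residual path: fewer than four output columns remain, so each remaining
// column is computed with the plain (unshifted) filters only and the banks
// advance one pixel (8 bits) per column instead of two.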
7311
7312 // Load next sub-micro block of data.
7313 input_bank_a_reg =
7314 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
7315 input_bank_a_reg = vld1q_lane_8x4(
7316 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
7317 input_bank_b_reg =
7318 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
7319 input_bank_b_reg, 1);
7320 input_bank_b_reg =
7321 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
7322 input_bank_b_reg, 3);
7323 input_bank_c_reg =
7324 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
7325 input_bank_c_reg, 1);
7326 input_bank_c_reg =
7327 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
7328 input_bank_c_reg, 3);
7329
7330 // Iterate over input width shifts within 4x4 blocks.
7331 for (int x = 0; x < output_width; ++x) {
7332 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
7333 0);
7334 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
7335 2);
7336 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
7337 2);
7338 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
7339 2);
7340 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
7341 2);
7342 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
7343 0);
7344 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
7345 0);
7346 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
7347 2);
7348
7349 // Fixed-point multiplication.
7350 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7351 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7352 acc0, output_shift);
7353 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7354 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7355 acc1, output_shift);
7356 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7357 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7358 acc2, output_shift);
7359 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7360 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7361 acc3, output_shift);
7362 // Add the output offset.
7363 int16x8_t acc_s16_0_1 =
7364 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7365 int16x8_t acc_s16_2_3 =
7366 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7367 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7368 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7369 // Apply the activation function.
7370 int8x16_t acc_u8_all = vcombine_s8(vqmovxn_s16(acc_s16_0_1),
7371 vqmovxn_s16(acc_s16_2_3));
7372 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7373 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7374
7375 vst1q_lane_s8x4(output_data, acc_u8_all, 0);
7376 vst1q_lane_s8x4(output_data + output_height_stride, acc_u8_all,
7377 1);
7378 vst1q_lane_s8x4(output_data + 2 * output_height_stride,
7379 acc_u8_all, 2);
7380 vst1q_lane_s8x4(output_data + 3 * output_height_stride,
7381 acc_u8_all, 3);
7382
7383 input_bank_a_reg = vreinterpretq_s8_u64(
7384 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 8));
7385 input_bank_b_reg = vreinterpretq_s8_u64(
7386 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 8));
7387 input_bank_c_reg = vreinterpretq_s8_u64(
7388 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 8));
7389
7390 output_data += output_depth;
7391
7392 acc0 = adjusted_bias_data;
7393 acc1 = adjusted_bias_data;
7394 acc2 = adjusted_bias_data;
7395 acc3 = adjusted_bias_data;
7396
7397 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7398 0);
7399 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7400 0);
7401 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7402 0);
7403 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7404 2);
7405 }
7406 }
7407 // scratch_block_data += 4 * workspace_height_stride;
7408 output_data_base += 4 * output_height_stride;
7409
7410 // Move to next sub-block: advance to second set of filters, to new
7411 // bias.
7412 filter_reg_0_a = filter_reg_0_b;
7413 filter_reg_1_a = filter_reg_1_b;
7414 filter_reg_2_a = filter_reg_2_b;
7415 filter_reg_0_a_shifted = vreinterpretq_s8_u32(
7416 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_0_a), 8));
7417 filter_reg_1_a_shifted = vreinterpretq_s8_u32(
7418 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_1_a), 8));
7419 filter_reg_2_a_shifted = vreinterpretq_s8_u32(
7420 vshlq_n_u32(vreinterpretq_u32_s8(filter_reg_2_a), 8));
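// The second pass of the s-loop (s == 1) covers output channels 4..7, so the
// working registers switch to the *_b filter set and the shifted variants are
// re-derived for it.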
7421 }
7422 } else {
7423 // Block height < 4.
7424 typename QuantizationTypeImpl<quantization_type>::ExternalType*
7425 output_data_base = output_data_depthwise;
7426
7427 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
7428 bias_data += kBiasIncrement;
7429 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
7430 bias_data += kBiasIncrement;
7431
7432 const int32x4_t output_shift_a =
7433 vld1q_s32(output_shift_per_channel + j_depth * 8);
7434 const int32x4_t output_multiplier_a =
7435 vld1q_s32(output_multiplier_per_channel + j_depth * 8);
7436 const int32x4_t output_shift_b =
7437 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
7438 const int32x4_t output_multiplier_b =
7439 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
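// With block_height < 4, each output row is processed on its own, but every
// column still produces all eight channels of the micro block: the "a" filter
// set with the _a bias/multiplier/shift covers channels 0..3, and the "b" set
// with the _b parameters covers channels 4..7.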
7440
7441 for (int k_height = 0; k_height < block_height; ++k_height) {
7442 const int8* next_input_data =
7443 scratch_block_data + k_height * workspace_height_stride;
7444 typename QuantizationTypeImpl<quantization_type>::ExternalType*
7445 output_data = output_data_base;
7446
7447 int8x16_t input_bank_p_reg; // left 0, right 0, left 1, right 1.
7448 int8x16_t input_bank_q_reg; // left 2, right 2, left 3, right 3.
7449
7450 // Load first sub-micro block of data into operational banks.
7451 input_bank_p_reg =
7452 vld1q_dup_s8x4(next_input_data); // Load lane 0, avoiding
7453 // uninitialized variable.
7454 input_bank_p_reg = vld1q_lane_8x4(
7455 next_input_data + workspace_height_stride, input_bank_p_reg, 2);
7456 input_bank_q_reg = vld1q_dup_s8x4(
7457 next_input_data +
7458 2 * workspace_height_stride); // Load lane 0, avoiding
7459 // uninitialized variable.
7460
7461 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
7462 ++i_width) {
7463 next_input_data += 4;
7464 const int output_width =
7465 i_width == output_width_micro_repeats ? residual_width : 4;
7466
7467 // Load next sub-micro block of data.
7468 input_bank_p_reg =
7469 vld1q_lane_8x4(next_input_data, input_bank_p_reg, 1);
7470 input_bank_p_reg = vld1q_lane_8x4(
7471 next_input_data + workspace_height_stride, input_bank_p_reg, 3);
7472 input_bank_q_reg =
7473 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
7474 input_bank_q_reg, 1);
7475 // Iterate over input width shifts within 4x4 blocks.
7476 for (int x = 0; x < output_width; ++x) {
7477 int32x4_t acc_a = adjusted_bias_data_a;
7478 int32x4_t acc_b = adjusted_bias_data_b;
7479 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_0_a,
7480 input_bank_p_reg, 0);
7481 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_1_a,
7482 input_bank_p_reg, 2);
7483 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_2_a,
7484 input_bank_q_reg, 0);
7485 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_0_b,
7486 input_bank_p_reg, 0);
7487 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_1_b,
7488 input_bank_p_reg, 2);
7489 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_2_b,
7490 input_bank_q_reg, 0);
7491
7492 // Fixed-point multiplication.
7493 acc_a = vqrdmulhq_s32(acc_a, output_multiplier_a);
7494 acc_b = vqrdmulhq_s32(acc_b, output_multiplier_b);
7495 acc_a =
7496 DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7497 acc_a, output_shift_a);
7498 acc_b =
7499 DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7500 acc_b, output_shift_b);
7501 // Add the output offset.
7502 int16x8_t acc_s16_0_0 =
7503 vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
7504 acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
7505 // Apply the activation function.
7506 int8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
7507 acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
7508 vget_low_s8(output_activation_min_vec));
7509 acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
7510 vget_low_s8(output_activation_max_vec));
7511
7512 vst1_s8(output_data, acc_u8_0_0);
7513
7514 input_bank_p_reg = vreinterpretq_s8_u64(
7515 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_p_reg), 8));
7516 input_bank_q_reg = vreinterpretq_s8_u64(
7517 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_q_reg), 8));
7518
7519 output_data += output_depth;
7520 }
7521 }
7522 output_data_base += output_height_stride;
7523 }
7524 }
7525 output_data_depthwise += 8;
7526 }
7527 } // NOLINT(readability/fn_size) Manually unrolled.
7528
7529 static inline void Run(const int8* scratch_block_data,
7530 const int8* filter_workspace, const int32* bias_data,
7531 int8* output_block_data,
7532 const DepthwiseConvDotProdParams* function_params) {
7533 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
7534 output_block_data, function_params);
7535 }
7536 };
7537
7538 template <>
7539 struct KernelMacroBlock<
7540 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
7541 QuantizationType::kPerChannelInt8,
7542 DepthwiseConvDepthMultiplication::kUnitInputDepth,
7543 /*stride=*/2> {
7544 static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
7545 static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
7546 return vmin_s8(a, b);
7547 }
7548 static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
7549 return vmax_s8(a, b);
7550 }
7551
7552 static inline void KernelMacroBlockIntrinsics(
7553 const int8* scratch_block_data, const int8* filter_workspace,
7554 const int32* bias_data, int8* output_block_data,
7555 const DepthwiseConvDotProdParams* function_params) {
7556 static constexpr QuantizationType quantization_type =
7557 QuantizationType::kPerChannelInt8;
7558
7559 const int workspace_height_stride =
7560 function_params->workspace_height_stride;
7561 const int output_width_micro_repeats =
7562 function_params->output_width_micro_repeats;
7563 const int depth_micro_repeats = function_params->depth_micro_repeats;
7564 const int output_depth = function_params->output_depth;
7565 constexpr int kStrideVal = 2;
7566 TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
7567
7568 const int output_width_overall_micro_repeats =
7569 function_params->output_width_overall_micro_repeats;
7570 const int block_height = function_params->outbound_block_height;
7571 const int residual_width = function_params->output_residual_width;
7572 const int output_height_stride = function_params->output_height_stride;
7573 constexpr int kBiasIncrement = 4;
7574
7575 const int32 output_activation_min =
7576 function_params->quantized_activation_min;
7577 const int32 output_activation_max =
7578 function_params->quantized_activation_max;
7579 const int32 output_offset = function_params->output_offset;
7580 const int32* output_shift_per_channel =
7581 function_params->output_shift_per_channel;
7582 const int32* output_multiplier_per_channel =
7583 function_params->output_multiplier_per_channel;
7584 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
7585 TFLITE_DCHECK_GE(output_activation_min, 0);
7586 TFLITE_DCHECK_LT(output_activation_min, 256);
7587 TFLITE_DCHECK_GE(output_activation_max, 0);
7588 TFLITE_DCHECK_LT(output_activation_max, 256);
7589 } else {
7590 TFLITE_DCHECK_GE(output_activation_min, -128);
7591 TFLITE_DCHECK_LT(output_activation_min, 128);
7592 TFLITE_DCHECK_GE(output_activation_max, -128);
7593 TFLITE_DCHECK_LT(output_activation_max, 128);
7594 TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
7595 TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
7596 }
7597 TFLITE_DCHECK_GE(output_offset, -32768);
7598 TFLITE_DCHECK_LT(output_offset, 32768);
7599
7600 TFLITE_DCHECK_GE(depth_micro_repeats, 1);
7601
7602 const int16x8_t output_offset_vec =
7603 vdupq_n_s16(static_cast<int16>(output_offset));
7604 const int8x16_t output_activation_min_vec =
7605 vdupq_n_s8(static_cast<int8>(output_activation_min));
7606 const int8x16_t output_activation_max_vec =
7607 vdupq_n_s8(static_cast<int8>(output_activation_max));
7608
7609 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
7610 int8x16_t filter_reg_0_a;
7611 int8x16_t filter_reg_0_b;
7612 int8x16_t filter_reg_1_a;
7613 int8x16_t filter_reg_1_b;
7614 int8x16_t filter_reg_2_a;
7615 int8x16_t filter_reg_2_b;
7616
7617 filter_reg_0_a = vld1q_s8(filter_workspace);
7618 filter_workspace += 16;
7619 filter_reg_0_b = vld1q_s8(filter_workspace);
7620 filter_workspace += 16;
7621 filter_reg_1_a = vld1q_s8(filter_workspace);
7622 filter_workspace += 16;
7623 filter_reg_1_b = vld1q_s8(filter_workspace);
7624 filter_workspace += 16;
7625 filter_reg_2_a = vld1q_s8(filter_workspace);
7626 filter_workspace += 16;
7627 filter_reg_2_b = vld1q_s8(filter_workspace);
7628 filter_workspace += 16;
7629
7630 const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
7631 bias_data += kBiasIncrement;
7632 const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
7633 bias_data += kBiasIncrement;
7634
7635 const int32x4_t output_shift_s_0 =
7636 vld1q_s32(output_shift_per_channel + j_depth * 8);
7637 const int32x4_t output_multiplier_s_0 =
7638 vld1q_s32(output_multiplier_per_channel + j_depth * 8);
7639 const int32x4_t output_shift_s_1 =
7640 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
7641 const int32x4_t output_multiplier_s_1 =
7642 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
7643
7644 if (block_height == 2) {
7645 const int8* scratch_data = scratch_block_data;
7646 typename QuantizationTypeImpl<quantization_type>::ExternalType*
7647 output_data = output_block_data + 8 * j_depth;
7648
7649 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
7650 int8x16_t input_bank_b_reg; // left 2, right 2, left 3, right 3.
7651 int8x16_t input_bank_c_reg; // left 4, right 4, xxx, xxx.
7652
7653 // Load first sub-micro block of data into operational banks.
7654 input_bank_a_reg =
7655 vld1q_dup_s8x4(scratch_data); // Load lane 0, avoiding
7656 // uninitialized variable.
7657 input_bank_a_reg = vld1q_lane_8x4(
7658 scratch_data + workspace_height_stride, input_bank_a_reg, 2);
7659 input_bank_b_reg = vld1q_dup_s8x4(
7660 scratch_data +
7661 2 * workspace_height_stride); // Load lane 0, avoiding
7662 // uninitialized variable.
7663 input_bank_b_reg = vld1q_lane_8x4(
7664 scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
7665 input_bank_c_reg = vld1q_dup_s8x4(
7666 scratch_data +
7667 4 * workspace_height_stride); // Load lane 0, avoiding
7668 // uninitialized variable.
7669
7670 int32x4_t acc0;
7671 int32x4_t acc1;
7672
7673 // When output_width_micro_repeats < output_width_overall_micro_repeats,
7674 // 0 < residual_width <= 2, and so residual_width == 1 is then true iff
7675 // residual_width < 2.
7676 const int adjusted_width_micro_repeats =
7677 (output_width_micro_repeats < output_width_overall_micro_repeats) &&
7678 (residual_width < 2)
7679 ? output_width_micro_repeats
7680 : output_width_overall_micro_repeats;
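// With stride 2, each loaded group of four input pixels yields at most two
// output columns, so residual_width here is at most 2 and the banks advance
// two pixels (16 bits) after every output column.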
7681
7682 int i_width = 0;
7683 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
7684 const int8* input_data = scratch_data + 4 + 4 * i_width;
7685
7686 // Load next sub-micro block of data.
7687 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
7688 input_bank_a_reg = vld1q_lane_8x4(
7689 input_data + workspace_height_stride, input_bank_a_reg, 3);
7690 input_bank_b_reg = vld1q_lane_8x4(
7691 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
7692 input_bank_b_reg = vld1q_lane_8x4(
7693 input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
7694 input_bank_c_reg = vld1q_lane_8x4(
7695 input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
7696
7697 int16x8_t acc_s16_0_1;
7698 int8x8_t acc_u8_0_1;
7699 // Iterate over input width shifts within 4x4 blocks.
7700 {
7701 acc0 = adjusted_bias_data_s_0;
7702 acc1 = adjusted_bias_data_s_0;
7703
7704 acc0 =
7705 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
7706 acc0 =
7707 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
7708 acc0 =
7709 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
7710 acc1 =
7711 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
7712 acc1 =
7713 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
7714 acc1 =
7715 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
7716
7717 // Fixed-point multiplication.
7718 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
7719 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7720 acc0, output_shift_s_0);
7721 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
7722 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7723 acc1, output_shift_s_0);
7724 // Add the output offset.
7725 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7726 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7727 // Apply the activation function.
7728 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7729 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7730 vget_low_s8(output_activation_min_vec));
7731 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7732 vget_low_s8(output_activation_max_vec));
7733
7734 vst1_lane_s8x4(output_data, acc_u8_0_1, 0);
7735 vst1_lane_s8x4(output_data + output_height_stride, acc_u8_0_1, 1);
7736
7737 acc0 = adjusted_bias_data_s_1;
7738 acc1 = adjusted_bias_data_s_1;
7739
7740 acc0 =
7741 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
7742 acc0 =
7743 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
7744 acc0 =
7745 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
7746 acc1 =
7747 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
7748 acc1 =
7749 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
7750 acc1 =
7751 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
7752
7753 // Fixed-point multiplication.
7754 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
7755 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7756 acc0, output_shift_s_1);
7757 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
7758 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7759 acc1, output_shift_s_1);
7760 // Add the output offset.
7761 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7762 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7763 // Apply the activation function.
7764 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7765 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7766 vget_low_s8(output_activation_min_vec));
7767 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7768 vget_low_s8(output_activation_max_vec));
7769
7770 vst1_lane_s8x4(output_data + 4, acc_u8_0_1, 0);
7771 vst1_lane_s8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
7772 1);
7773
7774 input_bank_a_reg = vreinterpretq_s8_u64(
7775 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
7776 input_bank_b_reg = vreinterpretq_s8_u64(
7777 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
7778 input_bank_c_reg = vreinterpretq_s8_u64(
7779 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
7780
7781 output_data += output_depth;
7782 }
7783
7784 // output_width == four_over_stride.
7785 acc0 = adjusted_bias_data_s_0;
7786 acc1 = adjusted_bias_data_s_0;
7787
7788 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
7789 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
7790 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
7791 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
7792 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
7793 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
7794
7795 // Fixed-point multiplication.
7796 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
7797 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7798 acc0, output_shift_s_0);
7799 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
7800 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7801 acc1, output_shift_s_0);
7802 // Add the output offset.
7803 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7804 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7805 // Apply the activation function.
7806 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7807 acc_u8_0_1 =
7808 util_vmax_x8(acc_u8_0_1, vget_low_s8(output_activation_min_vec));
7809 acc_u8_0_1 =
7810 util_vmin_x8(acc_u8_0_1, vget_low_s8(output_activation_max_vec));
7811
7812 vst1_lane_s8x4(output_data, acc_u8_0_1, 0);
7813 vst1_lane_s8x4(output_data + output_height_stride, acc_u8_0_1, 1);
7814
7815 acc0 = adjusted_bias_data_s_1;
7816 acc1 = adjusted_bias_data_s_1;
7817
7818 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
7819 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
7820 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
7821 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
7822 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
7823 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
7824
7825 // Fixed-point multiplication.
7826 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
7827 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7828 acc0, output_shift_s_1);
7829 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
7830 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7831 acc1, output_shift_s_1);
7832 // Add the output offset.
7833 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7834 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7835 // Apply the activation function.
7836 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7837 acc_u8_0_1 =
7838 util_vmax_x8(acc_u8_0_1, vget_low_s8(output_activation_min_vec));
7839 acc_u8_0_1 =
7840 util_vmin_x8(acc_u8_0_1, vget_low_s8(output_activation_max_vec));
7841
7842 vst1_lane_s8x4(output_data + 4, acc_u8_0_1, 0);
7843 vst1_lane_s8x4(output_data + 4 + output_height_stride, acc_u8_0_1, 1);
7844
7845 input_bank_a_reg = vreinterpretq_s8_u64(
7846 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
7847 input_bank_b_reg = vreinterpretq_s8_u64(
7848 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
7849 input_bank_c_reg = vreinterpretq_s8_u64(
7850 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
7851
7852 output_data += output_depth;
7853 }
7854 for (; i_width < output_width_overall_micro_repeats; ++i_width) {
7855 // output_width == 1.
7856 const int8* input_data = scratch_data + 4 + 4 * i_width;
7857
7858 // Load next sub-micro block of data.
7859 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
7860 input_bank_a_reg = vld1q_lane_8x4(
7861 input_data + workspace_height_stride, input_bank_a_reg, 3);
7862 input_bank_b_reg = vld1q_lane_8x4(
7863 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
7864 input_bank_b_reg = vld1q_lane_8x4(
7865 input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
7866 input_bank_c_reg = vld1q_lane_8x4(
7867 input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
7868
7869 int16x8_t acc_s16_0_1;
7870 int8x8_t acc_u8_0_1;
7871 // Iterate over input width shifts within 4x4 blocks.
7872 {
7873 acc0 = adjusted_bias_data_s_0;
7874 acc1 = adjusted_bias_data_s_0;
7875
7876 acc0 =
7877 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
7878 acc0 =
7879 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
7880 acc0 =
7881 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
7882 acc1 =
7883 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
7884 acc1 =
7885 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
7886 acc1 =
7887 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
7888
7889 // Fixed-point multiplication.
7890 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
7891 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7892 acc0, output_shift_s_0);
7893 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
7894 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7895 acc1, output_shift_s_0);
7896 // Add the output offset.
7897 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7898 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7899 // Apply the activation function.
7900 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7901 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7902 vget_low_s8(output_activation_min_vec));
7903 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7904 vget_low_s8(output_activation_max_vec));
7905
7906 vst1_lane_s8x4(output_data, acc_u8_0_1, 0);
7907 vst1_lane_s8x4(output_data + output_height_stride, acc_u8_0_1, 1);
7908
7909 acc0 = adjusted_bias_data_s_1;
7910 acc1 = adjusted_bias_data_s_1;
7911
7912 acc0 =
7913 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
7914 acc0 =
7915 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
7916 acc0 =
7917 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
7918 acc1 =
7919 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
7920 acc1 =
7921 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
7922 acc1 =
7923 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
7924
7925 // Fixed-point multiplication.
7926 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
7927 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7928 acc0, output_shift_s_1);
7929 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
7930 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7931 acc1, output_shift_s_1);
7932 // Add the output offset.
7933 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7934 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7935 // Apply the activation function.
7936 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7937 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7938 vget_low_s8(output_activation_min_vec));
7939 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7940 vget_low_s8(output_activation_max_vec));
7941
7942 vst1_lane_s8x4(output_data + 4, acc_u8_0_1, 0);
7943 vst1_lane_s8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
7944 1);
7945
7946 input_bank_a_reg = vreinterpretq_s8_u64(
7947 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
7948 input_bank_b_reg = vreinterpretq_s8_u64(
7949 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
7950 input_bank_c_reg = vreinterpretq_s8_u64(
7951 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_c_reg), 16));
7952
7953 output_data += output_depth;
7954 }
7955 }
7956 } else {
7957 TFLITE_DCHECK_EQ(block_height, 1);
7958 // Work through one slice, by row, at a time.
7959 const int8* scratch_data = scratch_block_data;
7960 typename QuantizationTypeImpl<quantization_type>::ExternalType*
7961 output_data = output_block_data + 8 * j_depth;
7962
7963 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
7964 int8x16_t input_bank_b_reg; // left 2, right 2, xxx, xxx.
7965
7966 // Load first sub-micro block of data into operational banks.
7967 input_bank_a_reg =
7968 vld1q_dup_s8x4(scratch_data); // Load lane 0, avoiding
7969 // uninitialized variable.
7970 input_bank_a_reg = vld1q_lane_8x4(
7971 scratch_data + workspace_height_stride, input_bank_a_reg, 2);
7972 input_bank_b_reg = vld1q_dup_s8x4(
7973 scratch_data +
7974 2 * workspace_height_stride); // Load lane 0, avoiding
7975 // uninitialized variable.
7976
7977 int32x4_t acc0;
7978 int32x4_t acc1;
7979
7980 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
7981 ++i_width) {
7982 const int output_width =
7983 i_width == output_width_micro_repeats ? residual_width : 2;
7984
7985 TFLITE_DCHECK_LE(output_width, 2);
7986 TFLITE_DCHECK_GE(output_width, 1);
7987 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
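// At most two output columns per loaded group: the block below always
// computes the first column, and the trailing `if (output_width == 2)` block
// computes the second one.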
7988 const int8* input_data = scratch_data + 4 + 4 * i_width;
7989
7990 // Load next sub-micro block of data.
7991 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
7992 input_bank_a_reg = vld1q_lane_8x4(
7993 input_data + workspace_height_stride, input_bank_a_reg, 3);
7994 input_bank_b_reg = vld1q_lane_8x4(
7995 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
7996
7997 int16x8_t acc_s16_0_1;
7998 int8x8_t acc_u8_0_1;
7999
8000 // Iterate over input width shifts within 4x4 blocks.
8001 {
8002 acc0 = adjusted_bias_data_s_0;
8003
8004 acc0 =
8005 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
8006 acc0 =
8007 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
8008 acc0 =
8009 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
8010
8011 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
8012 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
8013 acc0, output_shift_s_0);
8014
8015 // Second sub-block accumulation.
8016 acc1 = adjusted_bias_data_s_1;
8017
8018 acc1 =
8019 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
8020 acc1 =
8021 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
8022 acc1 =
8023 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
8024
8025 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
8026 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
8027 acc1, output_shift_s_1);
8028
8029 // Add the output offset.
8030 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
8031 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
8032 // Apply the activation function.
8033 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
8034 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
8035 vget_low_s8(output_activation_min_vec));
8036 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
8037 vget_low_s8(output_activation_max_vec));
8038
8039 // This stores the results for both sub-blocks together.
8040 vst1_s8(output_data, acc_u8_0_1);
8041
8042 input_bank_a_reg = vreinterpretq_s8_u64(
8043 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
8044 input_bank_b_reg = vreinterpretq_s8_u64(
8045 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
8046
8047 output_data += output_depth;
8048 }
8049 if (output_width == 2) {
8050 acc0 = adjusted_bias_data_s_0;
8051
8052 acc0 =
8053 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
8054 acc0 =
8055 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
8056 acc0 =
8057 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
8058
8059 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
8060 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
8061 acc0, output_shift_s_0);
8062
8063 // Second sub-block accumulation.
8064 acc1 = adjusted_bias_data_s_1;
8065
8066 acc1 =
8067 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
8068 acc1 =
8069 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
8070 acc1 =
8071 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
8072
8073 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
8074 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
8075 acc1, output_shift_s_1);
8076
8077 // Add the output offset.
8078 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
8079 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
8080 // Apply the activation function.
8081 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
8082 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
8083 vget_low_s8(output_activation_min_vec));
8084 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
8085 vget_low_s8(output_activation_max_vec));
8086
8087 // This stores the results for both sub-blocks together.
8088 vst1_s8(output_data, acc_u8_0_1);
8089
8090 input_bank_a_reg = vreinterpretq_s8_u64(
8091 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_a_reg), 16));
8092 input_bank_b_reg = vreinterpretq_s8_u64(
8093 vshrq_n_u64(vreinterpretq_u64_s8(input_bank_b_reg), 16));
8094
8095 output_data += output_depth;
8096 }
8097 }
8098 }
8099 }
8100 }
8101
8102 static inline void Run(const int8* scratch_block_data,
8103 const int8* filter_workspace, const int32* bias_data,
8104 int8* output_block_data,
8105 const DepthwiseConvDotProdParams* function_params) {
8106 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
8107 output_block_data, function_params);
8108 }
8109 };
8110
8111 #undef vst1_lane_s8x4
8112 #undef vst1_lane_u8x4
8113 #undef vst1q_lane_s8x4
8114 #undef vst1q_lane_u8x4
8115 #undef vld1q_lane_s8x8
8116 #undef vld1_lane_8x4
8117 #undef vld1q_lane_8x4
8118 #undef vld1q_dup_s8x4
8119
8120 #endif // USE_NEON
8121
8122 } // namespace depthwise_conv
8123 } // namespace optimized_ops
8124 } // namespace tflite
8125
8126 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
8127