1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include <sys/types.h>
16
17 #include <algorithm>
18 #include <cmath>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstdlib>
22 #include <cstring>
23 #include <limits>
24 #include <utility>
25
26 #include "ruy/ruy.h" // from @ruy
27 #include "tensorflow/lite/kernels/cpu_backend_context.h"
28 #include "tensorflow/lite/kernels/cpu_backend_gemm.h"
29 #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
30 #include "tensorflow/lite/kernels/internal/common.h"
31 #include "tensorflow/lite/kernels/internal/compatibility.h"
32 #include "tensorflow/lite/kernels/internal/cppmath.h"
33 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
34 #include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
35
36 #ifdef USE_NEON
37
38 // aligned_alloc is available (via cstdlib/stdlib.h) with C++17/C11.
39 #if __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L
40 #if !defined(__ANDROID__) || __ANDROID_API__ >= 28
41 // Neither Apple nor Windows provides aligned_alloc.
42 #if !defined(__APPLE__) && !defined(_WIN32)
43 // TODO(miaowang): Re-enable std::aligned_alloc when it is available in Android.
44 // #define TFLITE_USE_STD_ALIGNED_ALLOC
45 #endif
46 #endif
47 #endif
48
49 // Note: This is the same as ABSL_HAVE_BUILTIN, but can't include the header.
50 #ifdef __has_builtin
51 #define TFLITE_HAS_BUILTIN(x) __has_builtin(x)
52 #else
53 #define TFLITE_HAS_BUILTIN(x) 0
54 #endif
55
56 // Note: This is the same as ABSL_PREDICT_FALSE, but can't include the header.
57 #if TFLITE_HAS_BUILTIN(__builtin_expect) || \
58 (defined(__GNUC__) && !defined(__clang__))
59 #define TFLITE_UNLIKELY(x) (__builtin_expect(false || (x), false))
60 #else
61 #define TFLITE_UNLIKELY(x) (x)
62 #endif
63
64 namespace tflite {
65 namespace tensor_utils {
66 namespace {
67
68 constexpr int kFloatValuesPerNeonVector = 4;
69 constexpr int kInt16ValuesPerNeonVector = 8;
70 constexpr int kInt8ValuesPerNeonVector = 16;
71 constexpr int kNeonVectorAlignment = 4;
72 template <int PerNeonSize>
73 inline int RoundDownVectors(int size) {
74 return size & ~(PerNeonSize - 1);
75 }
76
77 // Allocates at least size bytes of uninitialized storage whose alignment is
78 // specified by alignment. The size parameter must be an integral multiple of
79 // alignment.
80 // The caller is responsible for freeing the allocated memory by calling free
81 // on the pointer returned through freeing_buffer.
82 inline void* aligned_alloc(size_t alignment, size_t size,
83 void** freeing_buffer) {
84 #ifdef TFLITE_USE_STD_ALIGNED_ALLOC
85 *freeing_buffer = std::aligned_alloc(
86 alignment, (size + alignment - 1) / alignment * alignment);
87 return *freeing_buffer;
88 #else
89 *freeing_buffer = malloc(size + alignment);
90 const size_t offset = ((uintptr_t)*freeing_buffer) % alignment; // NOLINT
91 return offset == 0
92 ? *freeing_buffer
93 : ((char*)*freeing_buffer + (alignment - offset)); // NOLINT
94 #endif
95 }
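// Illustrative usage sketch (not part of the original code; it mirrors how the
// helpers below use this function): the returned pointer is used for
// computation, while the pointer stored in freeing_buffer is the one that must
// eventually be passed to free().
//
//   void* buffer_to_free = nullptr;
//   int8_t* data = reinterpret_cast<int8_t*>(
//       aligned_alloc(kNeonVectorAlignment, padded_size, &buffer_to_free));
//   // ... use data ...
//   free(buffer_to_free);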
96
97 bool HasSdotInstruction() {
98 static const bool has_dotprod = DetectArmNeonDotprod();
99 return has_dotprod;
100 }
101
102 inline float AccumulateNeonLane(const float32x4_t lane) {
103 #ifdef __aarch64__
104 return vaddvq_f32(lane);
105 #else
106 return vgetq_lane_f32(lane, 0) + vgetq_lane_f32(lane, 1) +
107 vgetq_lane_f32(lane, 2) + vgetq_lane_f32(lane, 3);
108 #endif
109 }
110
111 // Empirically determined breakpoints for when to use CpuBackendGemm vs. the
112 // standard MatrixBatchVectorMultiplyAccumulate. Briefly, if the batch size is
113 // at least 8 and the device does not have sdot, use CpuBackendGemm. Otherwise,
114 // for large batch sizes, it makes sense to use CpuBackendGemm if the matrix
115 // is not extremely rectangular.
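// For illustration (example numbers, not from the original comment): with sdot
// available, rows = cols = 512 and batch = 32 give rectangularness_lg2 = 0 and
// batch_lg2 = 5, so 5 - 0 > 2 and CpuBackendGemm is used; with rows = 2048,
// cols = 128 and batch = 32, rectangularness_lg2 = 4 and 5 - 4 <= 2, so the
// standard kernel is used instead.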
116 bool UseCpuBackendGemm(int rows, int cols, int batch) {
117 if (!HasSdotInstruction()) {
118 return batch >= 8;
119 }
120 if (batch < 16) {
121 return false;
122 }
123 constexpr int kCpuBackendGemmThreshold = 2;
124 // Calculate "rectangularness" as a measure of how far from square the
125 // LHS matrix is.
126 int row_rect = rows / cols;
127 int col_rect = cols / rows;
128 int rectangularness_lg2 =
129 row_rect > 0 ? FloorLog2(row_rect) : FloorLog2(col_rect);
130 int batch_lg2 = FloorLog2(batch);
131 // Large batch sizes move us above the threshold, but can be offset
132 // by significant rectangularness.
133 int batch_lg2_minus_rect_lg2 = batch_lg2 - rectangularness_lg2;
134 return batch_lg2_minus_rect_lg2 > kCpuBackendGemmThreshold;
135 }
136
137 inline int32_t AccumulateNeonLane(const int32x4_t lane) {
138 #ifdef __aarch64__
139 return vaddvq_s32(lane);
140 #else
141 int64x2_t pairwiseAdded = vpaddlq_s32(lane);
142 return vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1);
143 #endif
144 }
145
146 // Single-rounding MultiplyByQuantizedMultiplier
147 #if TFLITE_SINGLE_ROUNDING
148 inline int32x4x2_t MultiplyByQuantizedMultiplier2Rows(
149 int32x4x2_t input_val, int32_t quantized_multiplier, int shift) {
150 TFLITE_DCHECK(quantized_multiplier >= 0);
151 const int right_shift = std::min(-1, shift);
152 const int left_shift = shift - right_shift;
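// In other words, left_shift + right_shift == shift with right_shift <= -1:
// e.g. shift == 0 gives left_shift == 1, right_shift == -1, and shift == -3
// gives left_shift == 0, right_shift == -3. The input is pre-shifted left,
// multiplied with vqdmulhq_s32, then rounding-right-shifted via vrshlq_s32
// (a negative shift amount shifts right).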
153
154 const int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
155 const int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
156 const int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
157
158 int32x4x2_t result;
159 result.val[0] = vrshlq_s32(
160 vqdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup),
161 right_shift_dup);
162
163 result.val[1] = vrshlq_s32(
164 vqdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup),
165 right_shift_dup);
166
167 return result;
168 }
169 // Double-rounding MultiplyByQuantizedMultiplier
170 #else
171 inline int32x4x2_t MultiplyByQuantizedMultiplier2Rows(
172 int32x4x2_t input_val, int32 quantized_multiplier, int shift) {
173 using gemmlowp::RoundingDivideByPOT;
174 using gemmlowp::SaturatingRoundingDoublingHighMul;
175 const int left_shift = shift > 0 ? shift : 0;
176 const int right_shift = shift > 0 ? 0 : -shift;
177 int32x4x2_t result;
178 // The vector type support for SaturatingRoundingDoublingHighMul in gemmlowp
179 // is limited to NEON.
180 #ifdef GEMMLOWP_NEON
181 const int32x4_t left_shifted_one_dup = vdupq_n_s32(1 << left_shift);
182 result.val[0] =
183 RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
184 vmulq_s32(input_val.val[0], left_shifted_one_dup),
185 quantized_multiplier),
186 right_shift);
187 result.val[1] =
188 RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
189 vmulq_s32(input_val.val[1], left_shifted_one_dup),
190 quantized_multiplier),
191 right_shift);
192 #else
193 for (int i = 0; i < 2; ++i) {
194 int32_t vals[4];
195 vals[0] = RoundingDivideByPOT(
196 SaturatingRoundingDoublingHighMul(
197 vgetq_lane_s32(input_val.val[i], 0) * (1 << left_shift),
198 quantized_multiplier),
199 right_shift);
200 vals[1] = RoundingDivideByPOT(
201 SaturatingRoundingDoublingHighMul(
202 vgetq_lane_s32(input_val.val[i], 1) * (1 << left_shift),
203 quantized_multiplier),
204 right_shift);
205 vals[2] = RoundingDivideByPOT(
206 SaturatingRoundingDoublingHighMul(
207 vgetq_lane_s32(input_val.val[i], 2) * (1 << left_shift),
208 quantized_multiplier),
209 right_shift);
210 vals[3] = RoundingDivideByPOT(
211 SaturatingRoundingDoublingHighMul(
212 vgetq_lane_s32(input_val.val[i], 3) * (1 << left_shift),
213 quantized_multiplier),
214 right_shift);
215
216 result.val[i] = vld1q_s32(reinterpret_cast<int32_t*>(&vals));
217 }
218 #endif
219 return result;
220 }
221 #endif // TFLITE_SINGLE_ROUNDING
222
223 } // namespace
224
225 void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
226 int m_cols, const float* vector,
227 int n_batch, float* result) {
228 // If m_cols is not divisible by the vector size, then we need to process the
229 // final few elements sequentially. postamble_start shows the start index
230 // where this should happen.
231 const int postamble_start =
232 RoundDownVectors<kFloatValuesPerNeonVector>(m_cols);
233
234 for (int b = 0; b < n_batch; b++) {
235 float* result_in_batch = result + b * m_rows;
236 const float* vector_in_batch = vector + b * m_cols;
237 const float* matrix_row = matrix;
238
239 // Main matrix by vector multiplication loop
240 for (int r = 0; r < m_rows; r++) {
241 float32x4_t acc_32x4 = vmovq_n_f32(0.0);
242 int c = 0;
243 for (; c < postamble_start; c += kFloatValuesPerNeonVector) {
244 // Load 4 float values from vector and matrix row.
245 float32x4_t vector_f32x4 = vld1q_f32(vector_in_batch + c);
246 float32x4_t matrix_f32x4 = vld1q_f32(matrix_row + c);
247 // Multiply the vector and matrix row and add to accumulator.
248 acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
249 }
250 // Add the 4 intermediate sum values to get the final dot-prod value for
251 // this row.
252 *result_in_batch += AccumulateNeonLane(acc_32x4);
253 for (; TFLITE_UNLIKELY(c < m_cols); c++) {
254 *result_in_batch += matrix_row[c] * vector_in_batch[c];
255 }
256 matrix_row += m_cols;
257 ++result_in_batch;
258 }
259 }
260 }
261
262 #ifdef __aarch64__
263
264 // We interleave vector data to make the dot product logic more efficient.
265 // Suppose that vectors is:
266 // a0 a1 a2 a3 a4 a5 ...
267 // b0 b1 b2 b3 b4 b5 ...
268 // c0 c1 c2 c3 c4 c5 ...
269 // d0 d1 d2 d3 d4 d5 ...
270 // e0 e1 e2 e3 e4 e5 ...
271 // This code interleaves them like this:
272 // a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3 a4 a5 a6 a7 b4 ...
273 // e0 e1 e2 e3 f0 f1 f2 f3 ...
274 // Once the data is interleaved, each 16-byte read from the vectors pointer
275 // contains 4 bytes from each of 4 vectors.
276 const int8_t* ShuffleVectors(const int8_t* vectors, const int n_batch,
277 const int m_cols, void** shuffled_vectors_free) {
278 int8* shuffled_vectors = reinterpret_cast<int8*>(aligned_alloc(
279 kNeonVectorAlignment, n_batch * m_cols, shuffled_vectors_free));
280
281 for (int i = 0; i < n_batch; i += 4) {
282 int8* shuffled_vectors_ptr = shuffled_vectors + (i * m_cols);
283 const int8* unshuffled_vec0_ptr =
284 reinterpret_cast<const int8*>(vectors) + (i * m_cols);
285 const int8* unshuffled_vec1_ptr =
286 reinterpret_cast<const int8*>(vectors) + ((i + 1) * m_cols);
287 const int8* unshuffled_vec2_ptr =
288 reinterpret_cast<const int8*>(vectors) + ((i + 2) * m_cols);
289 const int8* unshuffled_vec3_ptr =
290 reinterpret_cast<const int8*>(vectors) + ((i + 3) * m_cols);
291 const int8* const end_vec0_ptr = unshuffled_vec1_ptr;
292
293 while (unshuffled_vec0_ptr != end_vec0_ptr) {
294 asm volatile(
295 // This code path requires that (m_cols % 16) == 0 so we can safely
296 // read in 16-byte chunks from each row.
297 "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n"
298 "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n"
299 "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n"
300 "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n"
301
302 "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n"
303 "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n"
304 "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n"
305 "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n"
306
307 : [unshuffled_vec0_ptr] "+r"(unshuffled_vec0_ptr),
308 [unshuffled_vec1_ptr] "+r"(unshuffled_vec1_ptr),
309 [unshuffled_vec2_ptr] "+r"(unshuffled_vec2_ptr),
310 [unshuffled_vec3_ptr] "+r"(unshuffled_vec3_ptr),
311 [shuffled_vectors_ptr] "+r"(shuffled_vectors_ptr)
312 :
313 : "v0", "v1", "v2", "v3", "cc", "memory");
314 }
315 }
316
317 return reinterpret_cast<const int8_t*>(shuffled_vectors);
318 }
319
320 // Notes about the speed of this version vs. the baseline (from memory):
321 // - With 256K of L1, we can keep a lot of vectors in cache.
322 // I recall a reasonable speedup just by rearranging the loop to have
323 // row on the outside and batch on the inside.
324 // - I also recall getting a nice speedup from sdot.
325 // - I tried many times to do better than the current implementation, using
326 // loop unrolling and instruction reordering to avoid stalls, etc.
327 // but I was not able to do significantly better. This code is, however,
328 // much worse than what the processor spec sheet suggests is possible.
329 static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
330 const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
331 const int8_t* vectors, const float* scaling_factors, int n_batch,
332 float* __restrict__ result) {
333 void* shuffled_vectors_free;
334
335 const int8_t* shuffled_vectors =
336 ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free);
337
338 for (int row = 0; row < m_rows; row += 2) {
339 for (int batch = 0; batch < n_batch; batch += 4) {
340 float* result_ptr = result + (batch * m_rows) + row;
341 const int8* mat_ptr0 = matrix + (row * m_cols);
342 const int8* mat_ptr1 = matrix + ((row + 1) * m_cols);
343 const int8* mat_ptr0_end = mat_ptr1;
344 const int8* vec_ptr = shuffled_vectors + (batch * m_cols);
345 const float* scaling_factors_ptr = scaling_factors + batch;
346 const uint64_t wide_rows = m_rows * sizeof(float);
347 const int8* mat_ptr2 = matrix + ((row + 2) * m_cols);
348 const int8* mat_ptr3 = matrix + ((row + 3) * m_cols);
349
350 asm volatile(
351 // Zero out the accumulator registers.
352 "movi v0.4s, #0\n"
353 "movi v1.4s, #0\n"
354 "movi v2.4s, #0\n"
355 "movi v3.4s, #0\n"
356
357 "1:\n" // batch_cols_loop
358
359 // Read 16 more bytes from a pair of matrix rows.
360 "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
361
362 // Prefetch two rows ahead.
363 "prfm pldl1strm, [%[mat_ptr2]]\n"
364 "prfm pldl1strm, [%[mat_ptr3]]\n"
365
366 // Read from input vectors 4 times; 64 bytes total.
367 // Each 16-byte register contains parts of 4 vectors; see the
368 // shuffle logic above.
369
370 // From Benoit, places to look in the future:
371 // - Move load instructions further from sdot
372 // - Switch loop use-then-reload
373 // - Do partial unrolling to use register space better
374 "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
375 ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
376 "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
377 ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
378 "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
379 ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
380 "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
381 ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
382
383 // Update prefetch pointers.
384 "add %[mat_ptr2], %[mat_ptr2], #16\n"
385 "add %[mat_ptr3], %[mat_ptr3], #16\n"
386
387 // Re-use those vectors for the next row as well.
388 "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
389 ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
390 ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
391 ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
392 ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
393
394 // If we're not done with these rows, continue.
395 "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
396 "bne 1b\n" // batch_cols_loop
397
398 // Done with the rows, sum the results.
399 "add v0.4s, v0.4s, v1.4s\n"
400 "add v2.4s, v2.4s, v3.4s\n"
401
402 // Convert the per-vector sums to floating point.
403 "scvtf v0.4s, v0.4s\n"
404 "scvtf v1.4s, v2.4s\n"
405
406 // Fetch scale factors.
407 "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
408
409 // Multiply scale factors times sums.
410 "fmul v0.4s, v4.4s, v0.4s\n"
411 "fmul v1.4s, v4.4s, v1.4s\n"
412
413 // Load previous result values.
414 // The result position is:
415 // result[batch * m_rows + row]
416 // Here that is factored into:
417 // result_ptr = result + row
418 // *result_ptr = res[0]
419 // (uint8*)result_ptr += (m_rows * sizeof(float))
420 // *result_ptr = res[1]
421 // ...
422 // Since we're reading two rows at a time, though, we read both
423 // result[batch * m_rows + row]
424 // and
425 // result[batch * m_rows + row + 1]
426 "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
427 "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
428 "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
429 "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
430
431 // Go back to the starting position (subtract wide_rows * 4).
432 "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
433
434 // Add previous result values.
435 "fadd v9.4s, v9.4s, v0.4s\n"
436 "fadd v10.4s, v10.4s, v1.4s\n"
437
438 // Store results.
439 "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
440 "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
441 "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
442 "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
443 : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1),
444 [vec_ptr] "+r"(vec_ptr), [result_ptr] "+r"(result_ptr),
445 [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3)
446 : [mat_ptr0_end] "r"(mat_ptr0_end),
447 [scaling_factors_ptr] "r"(scaling_factors_ptr),
448 [wide_rows] "r"(wide_rows)
449 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
450 "v10", "v11", "v12", "v13", "cc", "memory");
451 }
452 }
453
454 free(shuffled_vectors_free);
455 }
456
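// Variant of the kernel above that additionally applies per-channel scales and
// corrects for per-batch input zero points (input_offset): the integer
// accumulator of each row is adjusted by subtracting input_offset * row_sum
// before scaling, which accounts for inputs that were quantized with a
// non-zero offset. Row sums are taken from the row_sums argument when it is
// provided, or accumulated on the fly in the assembly loop when it is null.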
457 static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
458 const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
459 const int8_t* vectors, const float* scaling_factors, int n_batch,
460 float* __restrict__ result, const float* per_channel_scale,
461 const int32_t* input_offset, int32_t* row_sums) {
462 void* shuffled_vectors_free;
463 const int8_t* shuffled_vectors =
464 ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free);
465
466 for (int row = 0; row < m_rows; row += 2) {
467 for (int batch = 0; batch < n_batch; batch += 4) {
468 const float* channel_scales_ptr = per_channel_scale + row;
469 int32_t* row_sums_ptr = row_sums ? row_sums + row : nullptr;
470
471 float* result_ptr = result + (batch * m_rows) + row;
472 const int8* mat_ptr0 = matrix + (row * m_cols);
473 const int8* mat_ptr1 = matrix + ((row + 1) * m_cols);
474 const int8* mat_ptr0_end = mat_ptr1;
475 const int8* vec_ptr = shuffled_vectors + (batch * m_cols);
476 const float* scaling_factors_ptr = scaling_factors + batch;
477 const uint64_t wide_rows = m_rows * sizeof(float);
478 const int32_t* batch_offsets_ptr = input_offset + batch;
479 const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr;
480 const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr;
481 asm volatile(
482 "movi v0.4s, #0\n"
483 "movi v1.4s, #0\n"
484 "movi v2.4s, #0\n"
485 "movi v3.4s, #0\n"
486 // Load zero points.
487 "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n"
488 "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
489 // Zero out zero point accumulators.
490 "movi v14.4s, #0\n"
491 "movi v15.4s, #0\n"
492
493 // Load per channel scales if not null.
494 "cmp %w[is_channel_scale_nullptr], #0\n"
495 "bne 1f\n"
496 "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n"
497 "ld1r {v17.4s}, [%[channel_scales_ptr]]\n"
498 "fmul v16.4s, v16.4s, v4.4s\n"
499 "fmul v17.4s, v17.4s, v4.4s\n"
500 "b 2f\n"
501 "1:\n"
502 "mov v16.16b, v4.16b\n"
503 "mov v17.16b, v4.16b\n"
504 "2:\n"
505 "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
506 "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
507 ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
508 "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
509 ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
510 "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
511 ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
512 "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
513 ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
514 "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
515 ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
516 ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
517 ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
518 ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
519 "cmp %w[is_row_sums_nullptr], #1\n"
520 "bne 3f\n"
521 // Accumulate row_sums for zero point calculations.
522 "saddlp v12.8h, v12.16b\n"
523 "saddlp v13.8h, v13.16b\n"
524 "sadalp v14.4s, v12.8h\n"
525 "sadalp v15.4s, v13.8h\n"
526 "3:\n"
527 "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
528 "bne 2b\n"
529 "add v0.4s, v0.4s, v1.4s\n"
530 "add v2.4s, v2.4s, v3.4s\n"
531
532 "cmp %w[is_row_sums_nullptr], #1\n"
533 "bne 4f\n"
534 // Calculate zero point offsets.
535 "addv s14, v14.4s\n"
536 "addv s15, v15.4s\n"
537 "dup v14.4s, v14.s[0]\n"
538 "dup v15.4s, v15.s[0]\n"
539 "b 5f\n"
540 "4:\n"
541 "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n"
542 "ld1r {v15.4s}, [%[row_sums_ptr]]\n"
543 "5:\n"
544
545 "mul v14.4s, v14.4s, v7.4s\n"
546 "mul v15.4s, v15.4s, v7.4s\n"
547 "sub v0.4s, v0.4s, v14.4s\n"
548 "sub v2.4s, v2.4s, v15.4s\n"
549
550 "scvtf v0.4s, v0.4s\n"
551 "scvtf v1.4s, v2.4s\n"
552
553 // Multiply scale.
554 "fmul v0.4s, v16.4s, v0.4s\n"
555 "fmul v1.4s, v17.4s, v1.4s\n"
556
557 "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
558 "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
559 "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
560 "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
561 "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
562 "fadd v9.4s, v9.4s, v0.4s\n"
563 "fadd v10.4s, v10.4s, v1.4s\n"
564 "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
565 "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
566 "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
567 "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
568 : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1),
569 [vec_ptr] "+r"(vec_ptr), [result_ptr] "+r"(result_ptr),
570 [row_sums_ptr] "+r"(row_sums_ptr),
571 [channel_scales_ptr] "+r"(channel_scales_ptr)
572 : [mat_ptr0_end] "r"(mat_ptr0_end),
573 [scaling_factors_ptr] "r"(scaling_factors_ptr),
574 [wide_rows] "r"(wide_rows),
575 [batch_offsets_ptr] "r"(batch_offsets_ptr),
576 [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr),
577 [is_row_sums_nullptr] "r"(is_row_sums_nullptr)
578 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
579 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1",
580 "cc", "memory");
581 }
582 }
583
584 free(shuffled_vectors_free);
585 }
586
587 static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
588 const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
589 const int8_t* vectors, const float* scaling_factors, int n_batch,
590 float* __restrict__ result, const float* per_channel_scale,
591 const int32_t* input_offset) {
592 DotprodMatrixBatchFourVectorMultiplyAccumulate(
593 matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
594 per_channel_scale, input_offset, nullptr);
595 }
596
597 // The DotprodMatrixBatchFourVectorMultiplyAccumulate kernel processes 4
598 // vectors in the same amount of time as the baseline processes 1 vector.
599 // However, it requires 4 vectors of input.
600 //
601 // To take advantage of this speed difference, we add some zero-valued
602 // vectors to the batch so that n_batch is a multiple of 4. Then we execute
603 // DotprodMatrixBatchFourVectorMultiplyAccumulate on that padded batch, and
604 // extract just the results we want at the end (ignoring the extra padding
605 // outputs).
606 //
607 // The relative cost of the padding is large when the matrix is smaller than
608 // 128x128, so we don't use this code path on small matrices. On larger
609 // matrices, the computation cost dwarfs the padding cost, making this code
610 // viable.
611 //
612 // If we ignore the cost of padding, this kernel is:
613 // 1x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 1
614 // 2x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 2
615 // 3x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 3
616 // ...
617 //
618 // We don't use this kernel when n_batch = 1 because the baseline kernel
619 // is fine for that case.
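// For example (illustrative numbers): with n_batch = 6 the batch is padded to
// batch_round_up = 8 by appending two all-zero vectors with zero scaling
// factors; the four-vector kernel then runs on the padded batch and only the
// first n_batch * m_rows values are copied back into result.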
620 void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
621 const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
622 const int8_t* vectors, const float* scaling_factors, int n_batch,
623 float* __restrict__ result, const float* per_channel_scale,
624 const int32_t* input_offset, int32_t* row_sums) {
625 // Round the batch count up to the next multiple of 4.
626 int batch_round_up = n_batch;
627 if (n_batch % 4 != 0) {
628 batch_round_up += (4 - n_batch % 4);
629 }
630 TFLITE_CHECK_LE(n_batch, batch_round_up);
631
632 void* padded_vectors_free;
633 const int padded_vectors_size = batch_round_up * m_cols;
634 int8_t* padded_vectors = reinterpret_cast<int8_t*>(aligned_alloc(
635 kNeonVectorAlignment, padded_vectors_size, &padded_vectors_free));
636 memset(padded_vectors, 0, padded_vectors_size);
637
638 void* padded_result_free;
639 const int result_size = n_batch * m_rows * sizeof(float);
640 const int padded_result_size = batch_round_up * m_rows * sizeof(float);
641 float* padded_result = reinterpret_cast<float*>(aligned_alloc(
642 kNeonVectorAlignment, padded_result_size, &padded_result_free));
643 memcpy(padded_result, result, result_size);
644 memset(reinterpret_cast<char*>(padded_result) + result_size, 0,
645 padded_result_size - result_size);
646
647 // Copy the input into the padded data structure.
648 TFLITE_CHECK_LE(n_batch * m_cols, padded_vectors_size);
649 memcpy(padded_vectors, vectors, n_batch * m_cols);
650
651 void* padded_scaling_factors_free;
652 const int padded_scaling_factors_size = batch_round_up * sizeof(float);
653 float* padded_scaling_factors = reinterpret_cast<float*>(
654 aligned_alloc(kNeonVectorAlignment, padded_scaling_factors_size,
655 &padded_scaling_factors_free));
656 TFLITE_CHECK_LE(n_batch * sizeof(float), padded_scaling_factors_size);
657 TFLITE_CHECK_LE(batch_round_up * sizeof(float), padded_scaling_factors_size);
658 memset(padded_scaling_factors, 0, batch_round_up * sizeof(float));
659 memcpy(padded_scaling_factors, scaling_factors, n_batch * sizeof(float));
660
661 if (input_offset != nullptr) {
662 void* padded_input_offset_free;
663 const int padded_input_offset_size = batch_round_up * sizeof(int32_t);
664 int32_t* padded_input_offset = reinterpret_cast<int32_t*>(
665 aligned_alloc(kNeonVectorAlignment, padded_input_offset_size,
666 &padded_input_offset_free));
667 TFLITE_CHECK_LE(n_batch * sizeof(int32_t), padded_input_offset_size);
668 TFLITE_CHECK_LE(batch_round_up * sizeof(int32_t), padded_input_offset_size);
669 memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t));
670 memcpy(padded_input_offset, input_offset, n_batch * sizeof(int32_t));
671
672 // Call the main kernel.
673 DotprodMatrixBatchFourVectorMultiplyAccumulate(
674 matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors,
675 batch_round_up, padded_result, per_channel_scale, padded_input_offset,
676 row_sums);
677
678 free(padded_input_offset_free);
679 } else {
680 // Call the main kernel.
681 DotprodMatrixBatchFourVectorMultiplyAccumulate(
682 matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors,
683 batch_round_up, padded_result);
684 }
685 memcpy(result, padded_result, result_size);
686
687 free(padded_result_free);
688 free(padded_vectors_free);
689 free(padded_scaling_factors_free);
690 }
691
692 void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
693 const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
694 const int8_t* vectors, const float* scaling_factors, int n_batch,
695 float* __restrict__ result) {
696 DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
697 matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
698 /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr,
699 /*row_sums=*/nullptr);
700 }
701
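// Sparse kernel using the dotprod extension. As used below, the ledger encodes
// each row as a count of non-zero 16-byte chunks followed by one index per
// chunk; each index selects the 16 vector values starting at (index * 16) that
// the corresponding matrix chunk is multiplied against.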
702 static void DotprodSparseMatrixBatchVectorMultiplyAccumulate(
703 const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
704 const int m_cols, const int8_t* __restrict__ vectors,
705 const float* scaling_factors, int n_batch, float* __restrict__ result) {
706 const uint8_t* ledger_ptr = ledger;
707 const int8* mat_ptr = matrix;
708
709 for (int row = 0; row < m_rows; row++) {
710 int num_nonzero_chunks = *ledger_ptr;
711 ledger_ptr++;
712 const uint8* ledger_start = ledger_ptr;
713 const uint8* ledger_end = ledger_ptr + num_nonzero_chunks;
714 const int8* mat_start = mat_ptr;
715
716 for (int batch = 0; batch < n_batch; batch++) {
717 const int8* vec_ptr = vectors + (batch * m_cols);
718 int64_t row_sum = 0;
719
720 mat_ptr = mat_start;
721 ledger_ptr = ledger_start;
722
723 if (ledger_ptr != ledger_end) {
724 asm volatile(
725 "movi v0.4s, #0\n"
726 "movi v1.4s, #0\n"
727 "movi v8.4s, #0\n"
728 "mov x7, 0\n"
729
730 "1:\n" // chunks_loop
731
732 // Single matrix chunk, 16 bytes
733 "ld1 {v8.16b}, [%[mat_ptr]], #16\n"
734
735 // Read the next ledger index and increment.
736 "ldrb w7, [%[ledger_ptr]], #1\n"
737
738 // Read 16 bytes of vector data from (vec_ptr + (ledger_index * 16))
739 "add x8, %[vec_ptr], x7, lsl #4\n"
740 "ld1 {v9.16b}, [x8]\n"
741
742 // Dot product of matrix row and vector.
743 ".word 0x4e889520 // sdot v0.4s, v9.16b, v8.16b\n"
744
745 "cmp %[ledger_ptr], %[ledger_end]\n"
746 "blt 1b\n" // chunks_loop
747
748 // Sum the 4 vector components into a 32-bit value.
749 "addv s1, v0.4s\n"
750 // row_sum is 64-bit, so we copy 64 bits of v1 into it.
751 // We have to be careful to cast this value to 32 bits in order
752 // to interpret the sign bit properly.
753 "mov %[row_sum], v1.d[0]\n"
754 : [row_sum] "=r"(row_sum), [ledger_ptr] "+r"(ledger_ptr),
755 [mat_ptr] "+r"(mat_ptr), [vec_ptr] "+r"(vec_ptr)
756 : [ledger_end] "r"(ledger_end)
757 : "x0", "x1", "x7", "x8", "v0", "v1", "v8", "v9", "cc", "memory");
758 }
759 result[batch * m_rows + row] +=
760 static_cast<int32>(row_sum) * scaling_factors[batch];
761 }
762 }
763 }
764
765 #endif // __aarch64__
766
767 void NeonMatrixBatchVectorMultiplyImpl(const int8_t* input, const int32_t* bias,
768 const int8_t* input_to_gate_weights,
769 int32_t n_batch, int32_t n_input,
770 int32_t n_output, int32_t output_zp,
771 int32_t* scratch) {
772 // Assuming the weights matrix is kNeonVectorAlignment-byte aligned, every
773 // row of the matrix is also kNeonVectorAlignment-byte aligned as long as cols
774 // is a multiple of kNeonVectorAlignment. The assumption is currently satisfied
775 // by TFLite's 16-byte memory alignment scheme.
776 //
777 // Otherwise, we allocate an aligned memory block and set
778 // a flag to later copy rows from matrix to the block
779 // for aligned multiplication.
780 bool unaligned = false;
781 int8_t* aligned_row = nullptr;
782 void* aligned_row_free = nullptr;
783 if ((n_input & (kNeonVectorAlignment - 1)) != 0) {
784 unaligned = true;
785 aligned_row =
786 (int8_t*)aligned_alloc(kNeonVectorAlignment, n_input, // NOLINT
787 &aligned_row_free);
788 }
789 void* aligned_vec_free = nullptr;
790 int8_t* aligned_vec =
791 (int8_t*)aligned_alloc(kNeonVectorAlignment, n_input, // NOLINT
792 &aligned_vec_free);
793
794 // If n_input is not at least kInt8ValuesPerNeonVector, we cannot use the main
795 // vectorized loop, and we need to process sequentially. postamble_half_start
796 // shows the start index where this should happen. Between postamble_half_start
797 // and postamble_start we can still process kInt8ValuesPerNeonVector/2 elements
798 // in vectorized form.
799 const int postamble_half_start =
800 RoundDownVectors<kInt8ValuesPerNeonVector>(n_input);
801 const int postamble_start =
802 RoundDownVectors<(kInt8ValuesPerNeonVector / 2)>(n_input);
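// For example (illustrative value): with n_input = 29, postamble_half_start is
// 16 and postamble_start is 24, so columns [0, 16) go through the 16-wide
// loop, [16, 24) through one 8-wide half iteration, and [24, 29) through the
// scalar postamble.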
803
804 for (int batch = 0; batch < n_batch; ++batch) {
805 // Copy the vector data to an aligned vector.
806 memcpy(aligned_vec, input + batch * n_input, sizeof(int8_t) * n_input);
807 // Compute dot-product for every column.
808 for (int row = 0; row < n_output; ++row) {
809 // Get the address of the first element of the row.
810 int8_t* row_ptr =
811 (int8_t*)input_to_gate_weights + row * n_input; // NOLINT
812 if (unaligned) {
813 memcpy(aligned_row, row_ptr, sizeof(int8_t) * n_input);
814 row_ptr = aligned_row;
815 }
816
817 // Initialize the dot product sum for the row to 0.
818 int32x4_t dotprod_32x4 = vmovq_n_s32(0);
819
820 // For every block of 16 8-bit elements.
821 int col = 0;
822 for (; col < postamble_half_start; col += kInt8ValuesPerNeonVector) {
823 // Load 16 8-bit values from the row and vector, each, to operate on.
824 // Here the assumption is that each buffer is 4-byte aligned. Otherwise,
825 // performance may suffer significantly.
826 TFLITE_DCHECK_EQ( // NOLINT
827 (uintptr_t)(&row_ptr[col]) & (kNeonVectorAlignment - 1), 0);
828 const int8x16_t s1_8x16 = vld1q_s8((const int8_t*)(aligned_vec + col));
829 const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr + col));
830 // Multiply the low bits (i.e. the lower 8 8bit numbers in the
831 // registers).
832 int16x8_t prod_16x8 =
833 vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
834 // Multiply the high bits (i.e. the higher 8 8bit numbers in the
835 // registers), and accumulate with the result of the low bits product.
836 // The assumption here is that overflow will not happen as we quantize
837 // our values to be in the range [-127, 127]. As such the sum of the 2
838 // products is always strictly smaller than 15-bits (32767 in absolute
839 // value).
840 prod_16x8 =
841 vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
842
843 dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
844 } // for col
845
846 // Half iteration dealing with only 8 elements
847 if (TFLITE_UNLIKELY(col < postamble_start)) {
848 // Load 8 8-bit values from the row and column each to operate on.
849 // Here the assumption is that each buffer is 4-bytes aligned.
850 // Otherwise, performance may suffer significantly.
851 TFLITE_DCHECK_EQ( // NOLINT
852 (uintptr_t)(&row_ptr[col]) & (kNeonVectorAlignment - 1), 0);
853 const int8x8_t s1_8x8 = vld1_s8((const int8_t*)(aligned_vec + col));
854 const int8x8_t s2_8x8 = vld1_s8((const int8_t*)(row_ptr + col));
855 const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
856 dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
857 col += (kInt8ValuesPerNeonVector >> 1);
858 }
859 // Add the 4 intermediate sum values to get the final dot-prod value for
860 // this row.
861 int32_t dotprod = AccumulateNeonLane(dotprod_32x4);
862 // Postamble loop.
863 for (; TFLITE_UNLIKELY(col < n_input); ++col) {
864 dotprod += row_ptr[col] * aligned_vec[col];
865 } // for col
866
867 dotprod += bias[row];
868 scratch[batch * n_output + row] = dotprod;
869 } // for row
870 } // for batch
871
872 if (unaligned) {
873 free(aligned_row_free);
874 }
875 free(aligned_vec_free);
876 }
877
878 inline void NeonMatrixBatchVectorAccumulateImpl(
879 int32_t multiplier, int32_t shift, int32_t n_batch, int32_t n_output,
880 int32_t output_zp, int32_t* scratch, int16_t* output) {
881 int i = 0;
882 const int total_size = n_batch * n_output;
883
884 const int32_t output_min = std::numeric_limits<int16_t>::min();
885 const int32_t output_max = std::numeric_limits<int16_t>::max();
886
887 const int32x4_t output_zp_dup = vdupq_n_s32(output_zp);
888 const int32x4_t max_val_dup = vdupq_n_s32(output_max);
889 const int32x4_t min_val_dup = vdupq_n_s32(output_min);
890
891 using gemmlowp::RoundingDivideByPOT;
892 using gemmlowp::SaturatingRoundingDoublingHighMul;
893
894 for (; i <= total_size - 8; i += 8) {
895 int32x4x2_t scratch_val;
896 scratch_val.val[0] = vld1q_s32(scratch + i);
897 scratch_val.val[1] = vld1q_s32(scratch + i + 4);
898 const int16x8_t output_val = vld1q_s16(output + i);
899 const int32x4_t first_half = vmovl_s16(vget_low_s16(output_val));
900 const int32x4_t second_half = vmovl_s16(vget_high_s16(output_val));
901
902 int32x4x2_t temp_val =
903 MultiplyByQuantizedMultiplier2Rows(scratch_val, multiplier, shift);
904
905 temp_val.val[0] =
906 vaddq_s32(vaddq_s32(temp_val.val[0], first_half), output_zp_dup);
907 temp_val.val[1] =
908 vaddq_s32(vaddq_s32(temp_val.val[1], second_half), output_zp_dup);
909 temp_val.val[0] =
910 vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup);
911 temp_val.val[1] =
912 vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup);
913 const int16x8_t result =
914 vcombine_s16(vqmovn_s32(temp_val.val[0]), vqmovn_s32(temp_val.val[1]));
915 vst1q_s16(output + i, result);
916 }
917 for (; TFLITE_UNLIKELY(i < total_size); ++i) {
918 int32_t temp = MultiplyByQuantizedMultiplier(scratch[i], multiplier, shift);
919 temp += output_zp;
920 temp += output[i];
921 if (temp > output_max) {
922 temp = output_max;
923 }
924 if (temp < output_min) {
925 temp = output_min;
926 }
927 output[i] = static_cast<int16_t>(temp);
928 }
929 }
930
931 inline void NeonMatrixBatchVectorAccumulateImpl(
932 int32_t multiplier, int32_t shift, int32_t n_batch, int32_t n_output,
933 int32_t output_zp, int32_t* scratch, int8_t* output) {
934 int i = 0;
935 const int total_size = n_batch * n_output;
936
937 const int32_t output_min = std::numeric_limits<int8_t>::min();
938 const int32_t output_max = std::numeric_limits<int8_t>::max();
939
940 const int32x4_t output_zp_dup = vdupq_n_s32(output_zp);
941 const int32x4_t max_val_dup = vdupq_n_s32(output_max);
942 const int32x4_t min_val_dup = vdupq_n_s32(output_min);
943
944 using gemmlowp::RoundingDivideByPOT;
945 using gemmlowp::SaturatingRoundingDoublingHighMul;
946
947 for (; i <= total_size - 16; i += 16) {
948 int32x4x4_t scratch_val;
949 scratch_val.val[0] = vld1q_s32(scratch + i);
950 scratch_val.val[1] = vld1q_s32(scratch + i + 4);
951 scratch_val.val[2] = vld1q_s32(scratch + i + 8);
952 scratch_val.val[3] = vld1q_s32(scratch + i + 12);
953
954 const int8x16_t output_val = vld1q_s8(output + i);
955 const int16x8_t first_half = vmovl_s8(vget_low_s8(output_val));
956 const int16x8_t second_half = vmovl_s8(vget_high_s8(output_val));
957 const int32x4_t output_val_1 = vmovl_s16(vget_low_s16(first_half));
958 const int32x4_t output_val_2 = vmovl_s16(vget_high_s16(first_half));
959 const int32x4_t output_val_3 = vmovl_s16(vget_low_s16(second_half));
960 const int32x4_t output_val_4 = vmovl_s16(vget_high_s16(second_half));
961
962 int32x4x4_t temp_val =
963 MultiplyByQuantizedMultiplier4Rows(scratch_val, multiplier, shift);
964
965 temp_val.val[0] =
966 vaddq_s32(vaddq_s32(temp_val.val[0], output_val_1), output_zp_dup);
967 temp_val.val[1] =
968 vaddq_s32(vaddq_s32(temp_val.val[1], output_val_2), output_zp_dup);
969 temp_val.val[2] =
970 vaddq_s32(vaddq_s32(temp_val.val[2], output_val_3), output_zp_dup);
971 temp_val.val[3] =
972 vaddq_s32(vaddq_s32(temp_val.val[3], output_val_4), output_zp_dup);
973
974 temp_val.val[0] =
975 vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup);
976 temp_val.val[1] =
977 vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup);
978 temp_val.val[2] =
979 vmaxq_s32(vminq_s32(temp_val.val[2], max_val_dup), min_val_dup);
980 temp_val.val[3] =
981 vmaxq_s32(vminq_s32(temp_val.val[3], max_val_dup), min_val_dup);
982
983 const int16x8_t result_1 =
984 vcombine_s16(vqmovn_s32(temp_val.val[0]), vqmovn_s32(temp_val.val[1]));
985 const int16x8_t result_2 =
986 vcombine_s16(vqmovn_s32(temp_val.val[2]), vqmovn_s32(temp_val.val[3]));
987 const int8x16_t result =
988 vcombine_s8(vqmovn_s16(result_1), vqmovn_s16(result_2));
989 vst1q_s8(output + i, result);
990 }
991 for (; TFLITE_UNLIKELY(i < total_size); ++i) {
992 int32_t temp = MultiplyByQuantizedMultiplier(scratch[i], multiplier, shift);
993 temp += output_zp;
994 temp += output[i];
995 if (temp > output_max) {
996 temp = output_max;
997 }
998 if (temp < output_min) {
999 temp = output_min;
1000 }
1001 output[i] = static_cast<int8_t>(temp);
1002 }
1003 }
1004
1005 void NeonCpuBackendGemm(const int8_t* input, const int32_t* bias,
1006 const int8_t* input_to_gate_weights, int32_t n_batch,
1007 int32_t n_input, int32_t n_output, int32_t output_zp,
1008 int32_t* scratch, CpuBackendContext* context) {
1009 using ::tflite::cpu_backend_gemm::Gemm;
1010 using ::tflite::cpu_backend_gemm::GemmParams;
1011 using ::tflite::cpu_backend_gemm::MatrixParams;
1012
1013 MatrixParams<int8_t> lhs_params;
1014 lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
1015 lhs_params.rows = n_output;
1016 lhs_params.cols = n_input;
1017 lhs_params.cache_policy = cpu_backend_gemm::CachePolicy::kCacheIfLargeSpeedup;
1018
1019 MatrixParams<int8_t> rhs_params;
1020 rhs_params.order = cpu_backend_gemm::Order::kColMajor;
1021 rhs_params.rows = n_input;
1022 rhs_params.cols = n_batch;
1023
1024 MatrixParams<int32_t> dst_params;
1025 dst_params.order = cpu_backend_gemm::Order::kColMajor;
1026 dst_params.rows = n_output;
1027 dst_params.cols = n_batch;
1028
1029 GemmParams<int32, int32> gemm_params;
1030 if (bias) {
1031 gemm_params.bias = bias;
1032 }
1033 cpu_backend_gemm::Gemm(lhs_params, input_to_gate_weights, rhs_params, input,
1034 dst_params, scratch, gemm_params, context);
1035 }
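// Note on layout (inferred from the parameters above): with a row-major LHS of
// shape n_output x n_input and column-major RHS and destination, the GEMM
// writes scratch in the same order as the scalar path, i.e.
// scratch[batch * n_output + row], with the bias already added when provided.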
1036
1037 void NeonMatrixBatchVectorMultiplyAccumulate(
1038 const int8_t* input, const int32_t* bias,
1039 const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
1040 int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
1041 int32_t* scratch, int16_t* output, CpuBackendContext* context) {
1042 #ifdef TFLITE_WITH_RUY_GEMV
1043 NeonCpuBackendGemm(input, bias, input_to_gate_weights, n_batch, n_input,
1044 n_output, output_zp, scratch, context);
1045 #else
1046 NeonMatrixBatchVectorMultiplyImpl(input, bias, input_to_gate_weights, n_batch,
1047 n_input, n_output, output_zp, scratch);
1048 #endif
1049 NeonMatrixBatchVectorAccumulateImpl(multiplier, shift, n_batch, n_output,
1050 output_zp, scratch, output);
1051 }
1052
1053 void NeonMatrixBatchVectorMultiplyAccumulate(
1054 const int8_t* input, const int32_t* bias,
1055 const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
1056 int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
1057 int32_t* scratch, int8_t* output, CpuBackendContext* context) {
1058 #ifdef TFLITE_WITH_RUY_GEMV
1059 NeonCpuBackendGemm(input, bias, input_to_gate_weights, n_batch, n_input,
1060 n_output, output_zp, scratch, context);
1061 #else
1062 NeonMatrixBatchVectorMultiplyImpl(input, bias, input_to_gate_weights, n_batch,
1063 n_input, n_output, output_zp, scratch);
1064 #endif
1065 NeonMatrixBatchVectorAccumulateImpl(multiplier, shift, n_batch, n_output,
1066 output_zp, scratch, output);
1067 }
1068
1069 void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
1070 const int m_rows, const int m_cols,
1071 const int8_t* __restrict__ vectors,
1072 const float* scaling_factors,
1073 int n_batch,
1074 float* __restrict__ result) {
1075 #ifdef __aarch64__
1076 if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 &&
1077 m_rows >= n_batch) {
1078 if (n_batch % 4 == 0) {
1079 // Benchmarks suggest that it's always better to use the batch code
1080 // when we can, even on small matrices.
1081 DotprodMatrixBatchFourVectorMultiplyAccumulate(
1082 matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
1083 return;
1084 } else if (n_batch >= 2 && m_rows * m_cols >= 128 * 128) {
1085 DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
1086 matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
1087 return;
1088 }
1089 }
1090 #endif // __aarch64__
1091
1092 // Assuming *matrix is kNeonVectorAlignment-byte aligned, every row of the
1093 // matrix is also kNeonVectorAlignment-byte aligned as long as cols is a
1094 // multiple of kNeonVectorAlignment. The assumption is currently satisfied by
1095 // TFLite's 16-byte memory alignment scheme.
1096 //
1097 // Otherwise, we allocate an aligned memory block and set
1098 // a flag to later copy rows from matrix to the block
1099 // for aligned multiplication.
1100 bool unaligned = false;
1101 int8_t* aligned_row = nullptr;
1102 void* aligned_row_free = nullptr;
1103 if ((m_cols & (kNeonVectorAlignment - 1)) != 0) {
1104 unaligned = true;
1105 aligned_row =
1106 (int8_t*)aligned_alloc(kNeonVectorAlignment, m_cols, // NOLINT
1107 &aligned_row_free);
1108 }
1109 void* aligned_vec_free = nullptr;
1110 int8_t* aligned_vec =
1111 (int8_t*)aligned_alloc(kNeonVectorAlignment, m_cols, // NOLINT
1112 &aligned_vec_free);
1113
1114 // If m_cols is not at least kInt8ValuesPerNeonVector, we cannot use the main
1115 // vectorized loop, and we need to process sequentially. postamble_half_start
1116 // shows the start index where this should happen. Between postamble_half_start
1117 // and postamble_start we can still process kInt8ValuesPerNeonVector/2 elements
1118 // in vectorized form.
1119 const int postamble_half_start =
1120 RoundDownVectors<kInt8ValuesPerNeonVector>(m_cols);
1121 const int postamble_start =
1122 RoundDownVectors<(kInt8ValuesPerNeonVector / 2)>(m_cols);
1123
1124 for (int batch = 0; batch < n_batch; ++batch) {
1125 const float batch_scaling_factor = scaling_factors[batch];
1126 // Copy the vector data to an aligned vector.
1127 memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols);
1128 // Compute dot-product for every column.
1129 for (int row = 0; row < m_rows; ++row) {
1130 // Get the address of the first element of the row.
1131 int8_t* row_ptr = (int8_t*)matrix + row * m_cols; // NOLINT
1132 if (unaligned) {
1133 memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols);
1134 row_ptr = aligned_row;
1135 }
1136
1137 // Initialize the dot product sum for the row to 0.
1138 int32x4_t dotprod_32x4 = vmovq_n_s32(0);
1139
1140 // Prefetch the row to cache.
1141 __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
1142 3 /* temporal locality */);
1143
1144 // For every block of 16 8-bit elements.
1145 int col = 0;
1146 for (; col < postamble_half_start; col += kInt8ValuesPerNeonVector) {
1147 // Load 16 8-bit values from the row and vector, each, to operate on.
1148 // Here the assumption is that each buffer is 4-byte aligned. Otherwise,
1149 // performance may suffer significantly.
1150 TFLITE_DCHECK_EQ( // NOLINT
1151 (uintptr_t)(&row_ptr[col]) & (kNeonVectorAlignment - 1), 0);
1152 const int8x16_t s1_8x16 = vld1q_s8((const int8_t*)(aligned_vec + col));
1153 const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr + col));
1154 // Multiply the low bits (i.e. the lower 8 8bit numbers in the
1155 // registers).
1156 int16x8_t prod_16x8 =
1157 vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
1158 // Multiply the high bits (i.e. the higher 8 8bit numbers in the
1159 // registers), and accumulate with the result of the low bits product.
1160 // The assumption here is that overflow will not happen as we quantize
1161 // our values to be in the range [-127, 127]. As such the sum of the 2
1162 // products is always strictly smaller than 15-bits (32767 in absolute
1163 // value).
1164 prod_16x8 =
1165 vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
1166
1167 dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
1168 } // for col
1169
1170 // Half iteration dealing with only 8 elements
1171 if (TFLITE_UNLIKELY(col < postamble_start)) {
1172 // Load 8 8-bit values from the row and column each to operate on.
1173 // Here the assumption is that each buffer is 4-bytes aligned.
1174 // Otherwise, performance may suffer significantly.
1175 TFLITE_DCHECK_EQ( // NOLINT
1176 (uintptr_t)(&row_ptr[col]) & (kNeonVectorAlignment - 1), 0);
1177 const int8x8_t s1_8x8 = vld1_s8((const int8_t*)(aligned_vec + col));
1178 const int8x8_t s2_8x8 = vld1_s8((const int8_t*)(row_ptr + col));
1179 const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
1180 dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
1181 col += (kInt8ValuesPerNeonVector >> 1);
1182 }
1183 // Add the 4 intermediate sum values to get the final dot-prod value for
1184 // this row.
1185 int32_t dotprod = AccumulateNeonLane(dotprod_32x4);
1186 // Postamble loop.
1187 for (; TFLITE_UNLIKELY(col < m_cols); ++col) {
1188 dotprod += row_ptr[col] * aligned_vec[col];
1189 } // for col
1190
1191 *result += dotprod * batch_scaling_factor;
1192 ++result;
1193 } // for row
1194 } // for batch
1195
1196 if (unaligned) {
1197 free(aligned_row_free);
1198 }
1199 free(aligned_vec_free);
1200 }
1201
1202 void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
1203 const int m_rows, const int m_cols,
1204 const int8_t* __restrict__ vectors,
1205 const float* scaling_factors,
1206 int n_batch, int32_t* scratch,
1207 float* __restrict__ result,
1208 CpuBackendContext* context) {
1209 if (m_rows % 4 == 0) {
1210 const int32_t* bias = static_cast<const int32_t*>(nullptr);
1211 NeonCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows,
1212 /*output_zp =*/0, scratch, context);
1213
1214 // Multiply by float scaling factors and write to result
1215 const int total_size = n_batch * m_rows;
1216 int i = 0;
1217 for (; i <= total_size - 8; i += 8, result += 8) {
1218 const float batch_scaling_factor0 = scaling_factors[i / m_rows];
1219 const float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows];
1220 const float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0);
1221 const float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1);
1222 const int32x4_t scratch_val0 = vld1q_s32(scratch + i);
1223 const int32x4_t scratch_val1 = vld1q_s32(scratch + i + 4);
1224 const float32x4_t float_val0 = vcvtq_f32_s32(scratch_val0);
1225 const float32x4_t float_val1 = vcvtq_f32_s32(scratch_val1);
1226 const float32x4_t result0 =
1227 vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0);
1228 const float32x4_t result1 =
1229 vmlaq_f32(vld1q_f32(result + 4), float_val1, scaling_factor1);
1230 vst1q_f32(result, result0);
1231 vst1q_f32(result + 4, result1);
1232 }
1233 scratch += i;
1234 for (; TFLITE_UNLIKELY(i < total_size); i++) {
1235 const float batch_scaling_factor = scaling_factors[i / m_rows];
1236 int32_t x = *(scratch++);
1237 *result += x * batch_scaling_factor;
1238 ++result;
1239 }
1240 return;
1241 }
1242 NeonMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
1243 scaling_factors, n_batch, result);
1244 }
1245
1246 void NeonMatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
1247 int32_t n_row, int32_t n_col,
1248 int32_t* output) {
1249 // Processing multiple rows at the same time actually makes it slower. :(
1250 for (int i = 0; i < n_row; ++i) {
1251 int32x4_t row_sum = vdupq_n_s32(0);
1252 int j = 0;
1253 const int8_t* row_ptr = matrix + i * n_col;
1254 for (; j <= n_col - kInt8ValuesPerNeonVector;
1255 j += kInt8ValuesPerNeonVector) {
1256 const int8x16_t input_value = vld1q_s8(row_ptr + j);
1257 int16x8_t temp = vmovl_s8(vget_low_s8(input_value));
1258 temp = vaddw_s8(temp, vget_high_s8(input_value));
1259 row_sum = vpadalq_s16(row_sum, temp);
1260 }
1261 int32_t sum = AccumulateNeonLane(row_sum);
1262 for (; TFLITE_UNLIKELY(j < n_col); ++j) {
1263 sum += *(row_ptr + j);
1264 }
1265 output[i] += sum * scalar;
1266 }
1267 }
1268
1269 void NeonMatrixBatchVectorMultiplyAccumulateImpl(
1270 const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
1271 const int8_t* __restrict__ vectors, const float* scaling_factors,
1272 int n_batch, float* __restrict__ result, const float* per_channel_scale,
1273 const int32_t* input_offset, int32_t* row_sums) {
1274 #ifdef __aarch64__
1275 if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 &&
1276 m_rows >= n_batch) {
1277 if (n_batch % 4 == 0) {
1278 DotprodMatrixBatchFourVectorMultiplyAccumulate(
1279 matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
1280 per_channel_scale, input_offset, row_sums);
1281 return;
1282 } else if (n_batch >= 2 && m_rows * m_cols >= 128 * 128) {
1283 DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
1284 matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
1285 per_channel_scale, input_offset, row_sums);
1286 return;
1287 }
1288 }
1289 #endif // __aarch64__
1290
1291 bool unaligned = false;
1292 int8_t* aligned_row = nullptr;
1293 void* aligned_row_free = nullptr;
1294 if ((m_cols & (kNeonVectorAlignment - 1)) != 0) {
1295 unaligned = true;
1296 aligned_row =
1297 (int8_t*)aligned_alloc(kNeonVectorAlignment, m_cols, // NOLINT
1298 &aligned_row_free);
1299 }
1300 void* aligned_vec_free = nullptr;
1301 int8_t* aligned_vec =
1302 (int8_t*)aligned_alloc(kNeonVectorAlignment, m_cols, // NOLINT
1303 &aligned_vec_free);
1304
1305 const int postamble_half_start =
1306 RoundDownVectors<kInt8ValuesPerNeonVector>(m_cols);
1307 const int postamble_start =
1308 RoundDownVectors<(kInt8ValuesPerNeonVector / 2)>(m_cols);
1309
1310 int32_t* row_sums_ptr = row_sums;
1311 if (row_sums == nullptr) {
1312 row_sums_ptr = static_cast<int32_t*>(malloc(sizeof(int32_t) * m_rows));
1313 NeonReductionSumVector(matrix, row_sums_ptr, m_rows, m_cols);
1314 }
1315
1316 for (int batch = 0; batch < n_batch; ++batch) {
1317 const float batch_scaling_factor = scaling_factors[batch];
1318 const int batch_input_offset = input_offset[batch];
1319 memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols);
1320 for (int row = 0; row < m_rows; ++row) {
1321 int8_t* row_ptr = (int8_t*)matrix + row * m_cols; // NOLINT
1322 if (unaligned) {
1323 memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols);
1324 row_ptr = aligned_row;
1325 }
1326 float scale = batch_scaling_factor;
1327 if (per_channel_scale) {
1328 scale *= per_channel_scale[row];
1329 }
1330 // Initialize the dot product sum for the row to 0.
1331 int32x4_t dotprod_32x4 = vmovq_n_s32(0);
1332
1333 // Prefetch the row to cache.
1334 __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
1335 3 /* temporal locality */);
1336
1337 // For every block of 16 8-bit elements.
1338 int col = 0;
1339 for (; col < postamble_half_start; col += kInt8ValuesPerNeonVector) {
1340 // Load 16 8-bit values from the row and vector, each, to operate on.
1341 // Here the assumption is that each buffer is 4-byte aligned. Otherwise,
1342 // performance may suffer significantly.
1343 TFLITE_DCHECK_EQ( // NOLINT
1344 (uintptr_t)(&row_ptr[col]) & (kNeonVectorAlignment - 1), 0);
1345 const int8x16_t s1_8x16 = vld1q_s8((const int8_t*)(aligned_vec + col));
1346 const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr + col));
1347 // Multiply the low bits (i.e. the lower 8 8bit numbers in the
1348 // registers).
1349 int16x8_t prod_16x8 =
1350 vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
1351 // Multiply the high bits (i.e. the higher 8 8bit numbers in the
1352 // registers), and accumulate with the result of the low bits product.
1353         // The assumption here is that overflow will not happen as we quantize
1354         // our values to be in the range [-127, 127]. As such the sum of the 2
1355         // products always fits within 15 bits (at most 32767 in absolute
1356         // value).
1357 prod_16x8 =
1358 vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
1359 dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
1360 } // for col
1361
1362       // Half iteration dealing with only 8 elements.
1363       if (TFLITE_UNLIKELY(col < postamble_start)) {
1364         // Load 8 8-bit values each from the row and vector to operate on.
1365         // Here the assumption is that each buffer is 4-byte aligned.
1366         // Otherwise, performance may suffer significantly.
1367 TFLITE_DCHECK_EQ( // NOLINT
1368 (uintptr_t)(&row_ptr[col]) & (kNeonVectorAlignment - 1), 0);
1369 const int8x8_t s1_8x8 = vld1_s8((const int8_t*)(aligned_vec + col));
1370 const int8x8_t s2_8x8 = vld1_s8((const int8_t*)(row_ptr + col));
1371 const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
1372 dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
1373 col += (kInt8ValuesPerNeonVector >> 1);
1374 }
1375
1376 int32_t dotprod = AccumulateNeonLane(dotprod_32x4);
1377
1378 // Postamble loop.
1379 for (; TFLITE_UNLIKELY(col < m_cols); ++col) {
1380 dotprod += row_ptr[col] * aligned_vec[col];
1381 } // for col
1382 dotprod -= row_sums_ptr[row] * batch_input_offset;
1383 *result += dotprod * scale;
1384 ++result;
1385 } // for row
1386 } // for batch
1387
1388 if (row_sums == nullptr) {
1389 free(row_sums_ptr);
1390 }
1391 if (unaligned) {
1392 free(aligned_row_free);
1393 }
1394 free(aligned_vec_free);
1395 }
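
// Worked note on the offset correction above (a reading aid, not extra kernel
// code): with a per-batch input offset o, the intended product uses
// (v[j] - o), and
//   sum_j row[j] * (v[j] - o) = sum_j row[j] * v[j] - o * sum_j row[j].
// The NEON loop computes the first term on the raw int8 values and then
// subtracts row_sums[row] * batch_input_offset, before scaling by
// batch_scaling_factor (times per_channel_scale[row] when provided).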
1396
1397 void NeonMatrixBatchVectorMultiplyAccumulate(
1398 const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
1399 const int8_t* __restrict__ vectors, const float* scaling_factors,
1400 int n_batch, float* __restrict__ result, const float* per_channel_scale,
1401 const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
1402 bool* compute_row_sums, CpuBackendContext* context) {
1403 const bool use_cpu_backend_gemm = (context && context->use_caching()) ||
1404 UseCpuBackendGemm(m_rows, m_cols, n_batch);
1405 if (input_offset == nullptr) {
1406 if (use_cpu_backend_gemm && context) {
1407 NeonMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
1408 scaling_factors, n_batch, scratch,
1409 result, context);
1410 return;
1411 }
1412 NeonMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
1413 scaling_factors, n_batch, result);
1414 return;
1415 }
1416
1417 if (compute_row_sums == nullptr || *compute_row_sums) {
1418 NeonReductionSumVector(matrix, row_sums, m_rows, m_cols);
1419 if (compute_row_sums) {
1420 *compute_row_sums = false;
1421 }
1422 }
1423
1424 if (use_cpu_backend_gemm) {
1425 if (context != nullptr && m_rows % 4 == 0) {
1426 const int32_t* bias = static_cast<const int32_t*>(nullptr);
1427 NeonCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows, 0,
1428 scratch, context);
1429
1430 // Multiply by float scaling factors and write to result
1431 const int total_size = n_batch * m_rows;
1432 int i = 0;
1433 int32_t* scratch_ptr = scratch;
1434 for (; i <= total_size - 8; i += 8, result += 8) {
1435 const float batch_scaling_factor0 = scaling_factors[i / m_rows];
1436 const float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows];
1437 const int batch_input_offset0 = -input_offset[i / m_rows];
1438 const int batch_input_offset1 = -input_offset[(i + 4) / m_rows];
1439 float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0);
1440 float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1);
1441 if (per_channel_scale) {
1442 const float32x4_t per_channel_scale0 =
1443 vld1q_f32(&per_channel_scale[i % m_rows]);
1444 const float32x4_t per_channel_scale1 =
1445 vld1q_f32(&per_channel_scale[(i + 4) % m_rows]);
1446 scaling_factor0 = vmulq_f32(scaling_factor0, per_channel_scale0);
1447 scaling_factor1 = vmulq_f32(scaling_factor1, per_channel_scale1);
1448 }
1449 const int32x4_t input_offset0 = vdupq_n_s32(batch_input_offset0);
1450 const int32x4_t input_offset1 = vdupq_n_s32(batch_input_offset1);
1451 const int32x4_t row_sum0 = vld1q_s32(row_sums + (i % m_rows));
1452 const int32x4_t row_sum1 = vld1q_s32(row_sums + ((i + 4) % m_rows));
1453 const int32x4_t scratch_val0 = vld1q_s32(scratch_ptr + i);
1454 const int32x4_t scratch_val1 = vld1q_s32(scratch_ptr + i + 4);
1455 const int32x4_t dotprod0 =
1456 vmlaq_s32(scratch_val0, row_sum0, input_offset0);
1457 const int32x4_t dotprod1 =
1458 vmlaq_s32(scratch_val1, row_sum1, input_offset1);
1459 const float32x4_t float_val0 = vcvtq_f32_s32(dotprod0);
1460 const float32x4_t float_val1 = vcvtq_f32_s32(dotprod1);
1461 const float32x4_t result0 =
1462 vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0);
1463 const float32x4_t result1 =
1464 vmlaq_f32(vld1q_f32(result + 4), float_val1, scaling_factor1);
1465 vst1q_f32(result, result0);
1466 vst1q_f32(result + 4, result1);
1467 }
1468
1469 scratch_ptr += i;
1470 for (; TFLITE_UNLIKELY(i < total_size); i++) {
1471 float batch_scaling_factor = scaling_factors[i / m_rows];
1472 if (per_channel_scale) {
1473 batch_scaling_factor *= per_channel_scale[i % m_rows];
1474 }
1475 const int32_t zero_point = input_offset[i / m_rows];
1476 int32_t dotprod = *(scratch_ptr++);
1477 dotprod -= row_sums[i % m_rows] * zero_point;
1478 *result += dotprod * batch_scaling_factor;
1479 ++result;
1480 }
1481 return;
1482 }
1483 }
1484
1485 NeonMatrixBatchVectorMultiplyAccumulateImpl(
1486 matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
1487 per_channel_scale, input_offset, row_sums);
1488 }
1489
1490 inline int64x2x2_t MulAdd(int32x4_t acc, int32x4_t lhs, int32x4_t rhs) {
1491 int64x2x2_t result;
1492 const int64x2_t lhs_low = vmovl_s32(vget_low_s32(lhs));
1493 const int64x2_t lhs_high = vmovl_s32(vget_high_s32(lhs));
1494 const int64_t lhs_0 = vgetq_lane_s64(lhs_low, 0);
1495 const int64_t lhs_1 = vgetq_lane_s64(lhs_low, 1);
1496 const int64_t lhs_2 = vgetq_lane_s64(lhs_high, 0);
1497 const int64_t lhs_3 = vgetq_lane_s64(lhs_high, 1);
1498
1499 const int64x2_t rhs_low = vmovl_s32(vget_low_s32(rhs));
1500 const int64x2_t rhs_high = vmovl_s32(vget_high_s32(rhs));
1501 const int64_t rhs_0 = vgetq_lane_s64(rhs_low, 0);
1502 const int64_t rhs_1 = vgetq_lane_s64(rhs_low, 1);
1503 const int64_t rhs_2 = vgetq_lane_s64(rhs_high, 0);
1504 const int64_t rhs_3 = vgetq_lane_s64(rhs_high, 1);
1505
1506 const int64x2_t mul_0 = {lhs_0 * rhs_0, lhs_1 * rhs_1};
1507 const int64x2_t mul_1 = {lhs_2 * rhs_2, lhs_3 * rhs_3};
1508
1509 result.val[0] = vaddq_s64(vmovl_s32(vget_low_s32(acc)), mul_0);
1510 result.val[1] = vaddq_s64(vmovl_s32(vget_high_s32(acc)), mul_1);
1511 return result;
1512 }
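
// Usage sketch for MulAdd (values are made up for illustration): every lane is
// widened to 64 bits and the result is acc + lhs * rhs per lane, e.g.
//   acc = {1, 1, 1, 1}, lhs = {2, 3, 4, 5}, rhs = {10, 10, 10, 10}
// produces val[0] = {21, 31} and val[1] = {41, 51}.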
1513
1514 void NeonApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
1515 const int32_t* bias, int32_t layer_norm_scale_a,
1516 int32_t layer_norm_scale_b, int32_t variance_limit,
1517 int n_batch, int n_input, int16_t* output) {
1518 const int32 int16_max = std::numeric_limits<int16>::max();
1519 const int32 int16_min = std::numeric_limits<int16>::min();
1520 const int32 temp = 1048576 / n_input;
1521
1522 for (int i = 0; i < n_batch; ++i) {
1523 int64_t sum = 0;
1524 int64_t sum_sq = 0;
1525
1526 int j = 0;
1527 for (; j <= n_input - 8; j += 8) {
1528 const int32 index = i * n_input + j;
1529 const int16x8_t val_s16 = vld1q_s16(input + index);
1530 const int32x4_t val_s32_0 = vmovl_s16(vget_low_s16(val_s16));
1531 const int32x4_t val_s32_1 = vmovl_s16(vget_high_s16(val_s16));
1532
1533 sum += static_cast<int64_t>(AccumulateNeonLane(val_s32_0));
1534 sum += static_cast<int64_t>(AccumulateNeonLane(val_s32_1));
1535
1536 sum_sq += static_cast<int64_t>(
1537 AccumulateNeonLane(vmulq_s32(val_s32_0, val_s32_0)));
1538 sum_sq += static_cast<int64_t>(
1539 AccumulateNeonLane(vmulq_s32(val_s32_1, val_s32_1)));
1540 }
1541 for (; TFLITE_UNLIKELY(j < n_input); ++j) {
1542 const int32 index = i * n_input + j;
1543 int32 val = static_cast<int32_t>(input[index]);
1544 sum += val;
1545 sum_sq += val * val;
1546 }
1547
1548     // Divide by `n_input` first to avoid overflow; note that this only works
1549     // when `n_input` is a power of two.
1550 int32_t mean =
1551 static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
1552 int64_t variance =
1553 sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
1554 int32_t variance2 = static_cast<int32>(variance / 1048576);
1555 if (variance2 < 1) {
1556 variance2 = variance_limit;
1557 }
1558 int32_t stddev_inverse_a;
1559 int stddev_inverse_b;
1560 GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
1561 &stddev_inverse_a, &stddev_inverse_b);
1562
1563 j = 0;
1564 const int32x4_t mean_dup = vdupq_n_s32(mean);
1565 for (; j <= n_input - 16; j += 16) {
1566 // Load 16 items at once.
1567 const int32 index = i * n_input + j;
1568 const int16x8_t val_s16_0 = vld1q_s16(input + index);
1569 const int16x8_t val_s16_1 = vld1q_s16(input + index + 8);
1570
1571 int32x4x4_t shifted;
1572 shifted.val[0] = vsubq_s32(
1573 vshlq_n_s32(vmovl_s16(vget_low_s16(val_s16_0)), 10), mean_dup);
1574 shifted.val[1] = vsubq_s32(
1575 vshlq_n_s32(vmovl_s16(vget_high_s16(val_s16_0)), 10), mean_dup);
1576 shifted.val[2] = vsubq_s32(
1577 vshlq_n_s32(vmovl_s16(vget_low_s16(val_s16_1)), 10), mean_dup);
1578 shifted.val[3] = vsubq_s32(
1579 vshlq_n_s32(vmovl_s16(vget_high_s16(val_s16_1)), 10), mean_dup);
1580
1581 int32x4x4_t rescaled = MultiplyByQuantizedMultiplier4Rows(
1582 shifted, stddev_inverse_a, stddev_inverse_b);
1583
1584 const int32x4_t bias_0 = vld1q_s32(bias + j);
1585 const int32x4_t bias_1 = vld1q_s32(bias + j + 4);
1586 const int32x4_t bias_2 = vld1q_s32(bias + j + 8);
1587 const int32x4_t bias_3 = vld1q_s32(bias + j + 12);
1588
1589 const int16x8_t layer_norm_weights_s16_0 =
1590 vld1q_s16(layer_norm_weights + j);
1591 const int16x8_t layer_norm_weights_s16_1 =
1592 vld1q_s16(layer_norm_weights + j + 8);
1593 const int32x4_t layer_norm_weights_s32_0 =
1594 vmovl_s16(vget_low_s16(layer_norm_weights_s16_0));
1595 const int32x4_t layer_norm_weights_s32_1 =
1596 vmovl_s16(vget_high_s16(layer_norm_weights_s16_0));
1597 const int32x4_t layer_norm_weights_s32_2 =
1598 vmovl_s16(vget_low_s16(layer_norm_weights_s16_1));
1599 const int32x4_t layer_norm_weights_s32_3 =
1600 vmovl_s16(vget_high_s16(layer_norm_weights_s16_1));
1601
1602 int64x2x2_t val3_0 =
1603 MulAdd(bias_0, rescaled.val[0], layer_norm_weights_s32_0);
1604 int64x2x2_t val3_1 =
1605 MulAdd(bias_1, rescaled.val[1], layer_norm_weights_s32_1);
1606 int64x2x2_t val3_2 =
1607 MulAdd(bias_2, rescaled.val[2], layer_norm_weights_s32_2);
1608 int64x2x2_t val3_3 =
1609 MulAdd(bias_3, rescaled.val[3], layer_norm_weights_s32_3);
1610
1611 int32x4x4_t val4;
1612 val4.val[0] = vcombine_s32(vmovn_s64(vrshrq_n_s64(val3_0.val[0], 10)),
1613 vmovn_s64(vrshrq_n_s64(val3_0.val[1], 10)));
1614 val4.val[1] = vcombine_s32(vmovn_s64(vrshrq_n_s64(val3_1.val[0], 10)),
1615 vmovn_s64(vrshrq_n_s64(val3_1.val[1], 10)));
1616 val4.val[2] = vcombine_s32(vmovn_s64(vrshrq_n_s64(val3_2.val[0], 10)),
1617 vmovn_s64(vrshrq_n_s64(val3_2.val[1], 10)));
1618 val4.val[3] = vcombine_s32(vmovn_s64(vrshrq_n_s64(val3_3.val[0], 10)),
1619 vmovn_s64(vrshrq_n_s64(val3_3.val[1], 10)));
1620
1621 int32x4x4_t val5_s32 = MultiplyByQuantizedMultiplier4Rows(
1622 val4, layer_norm_scale_a, layer_norm_scale_b + 12);
1623 vst1_s16(output + index, vqmovn_s32(val5_s32.val[0]));
1624 vst1_s16(output + index + 4, vqmovn_s32(val5_s32.val[1]));
1625 vst1_s16(output + index + 8, vqmovn_s32(val5_s32.val[2]));
1626 vst1_s16(output + index + 12, vqmovn_s32(val5_s32.val[3]));
1627 }
1628 for (; TFLITE_UNLIKELY(j < n_input); ++j) {
1629 const int32 index = i * n_input + j;
1630 int32 val = static_cast<int32_t>(input[index]);
1631 int32 shifted = 1024 * val - mean;
1632 int32 rescaled = MultiplyByQuantizedMultiplier(shifted, stddev_inverse_a,
1633 stddev_inverse_b);
1634 // TODO(jianlijianli): Saturate this.
1635 int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
1636 int32 val4 =
1637 static_cast<int32>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
1638 int32 val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
1639 layer_norm_scale_b + 12);
1640 val5 = std::min(std::max(int16_min, val5), int16_max);
1641 output[index] = static_cast<int16_t>(val5);
1642 }
1643 }
1644 }
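
// Fixed-point summary of the loop above (a reading aid, not extra kernel
// code): the mean is kept in Q10 (sum * 1024 / n_input), and variance2 is
// roughly E[x^2] - E[x]^2 in plain integer units, because
// sum_sq * (2^20 / n_input) - mean^2 is scaled back down by 2^20. Each element
// is then centered in Q10 (1024 * x - mean), multiplied by the inverse
// standard deviation from GetInvSqrtQuantizedMultiplierExp, combined with
// layer_norm_weights and bias, rounded back down by 2^10, and finally rescaled
// with (layer_norm_scale_a, layer_norm_scale_b + 12) before saturating to
// int16.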
1645
1646 void NeonApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
1647 int16_t* output) {
1648 for (int batch = 0; batch < n_batch; ++batch) {
1649 int i = 0;
1650 #ifdef GEMMLOWP_NEON
1651 // F0 uses 0 integer bits, range [-1, 1].
1652 // This is the return type of math functions such as tanh, logistic,
1653 // whose range is in [-1, 1].
1654 using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
1655 // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
1656 using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
1657
1658 for (; i <= n_input - 32; i += 32) {
1659 const int index = batch * n_input + i;
1660 F3 input0 = F3::FromRaw(vld1q_s16(input + index));
1661 F3 input1 = F3::FromRaw(vld1q_s16(input + index + 8));
1662 F3 input2 = F3::FromRaw(vld1q_s16(input + index + 16));
1663 F3 input3 = F3::FromRaw(vld1q_s16(input + index + 24));
1664 F0 output0 = gemmlowp::logistic(input0);
1665 F0 output1 = gemmlowp::logistic(input1);
1666 F0 output2 = gemmlowp::logistic(input2);
1667 F0 output3 = gemmlowp::logistic(input3);
1668 vst1q_s16(output + index, output0.raw());
1669 vst1q_s16(output + index + 8, output1.raw());
1670 vst1q_s16(output + index + 16, output2.raw());
1671 vst1q_s16(output + index + 24, output3.raw());
1672 }
1673 #endif // GEMMLOWP_NEON
1674 using F0_Scalar = gemmlowp::FixedPoint<int16_t, 0>;
1675 using F3_Scalar = gemmlowp::FixedPoint<int16_t, 3>;
1676 for (; i < n_input; ++i) {
1677 const int index = batch * n_input + i;
1678 F3_Scalar input_f3 = F3_Scalar::FromRaw(input[index]);
1679 F0_Scalar output_f0 = gemmlowp::logistic(input_f3);
1680 output[index] = output_f0.raw();
1681 }
1682 }
1683 }
1684
1685 template <int IntegerBits>
1686 void NeonApplyTanhImpl(const int16_t* input, int32_t n_batch, int32_t n_input,
1687 int16_t* output) {
1688 for (int batch = 0; batch < n_batch; ++batch) {
1689 int i = 0;
1690 #ifdef GEMMLOWP_NEON
1691     // F_Out uses 0 integer bits, range [-1, 1].
1692 // This is the return type of math functions such as tanh, logistic,
1693 // whose range is in [-1, 1].
1694 using F_In = gemmlowp::FixedPoint<int16x8_t, IntegerBits>;
1695 using F_Out = gemmlowp::FixedPoint<int16x8_t, 0>;
1696
1697 for (; i <= n_input - 32; i += 32) {
1698 const int index = batch * n_input + i;
1699 F_In input0 = F_In::FromRaw(vld1q_s16(input + index));
1700 F_In input1 = F_In::FromRaw(vld1q_s16(input + index + 8));
1701 F_In input2 = F_In::FromRaw(vld1q_s16(input + index + 16));
1702 F_In input3 = F_In::FromRaw(vld1q_s16(input + index + 24));
1703 F_Out output0 = gemmlowp::tanh(input0);
1704 F_Out output1 = gemmlowp::tanh(input1);
1705 F_Out output2 = gemmlowp::tanh(input2);
1706 F_Out output3 = gemmlowp::tanh(input3);
1707 vst1q_s16(output + index, output0.raw());
1708 vst1q_s16(output + index + 8, output1.raw());
1709 vst1q_s16(output + index + 16, output2.raw());
1710 vst1q_s16(output + index + 24, output3.raw());
1711 }
1712 #endif // GEMMLOWP_NEON
1713 using F_In_Scalar = gemmlowp::FixedPoint<int16_t, IntegerBits>;
1714 using F_Out_Scalar = gemmlowp::FixedPoint<int16_t, 0>;
1715 for (; i < n_input; ++i) {
1716 const int index = batch * n_input + i;
1717 F_In_Scalar input_in = F_In_Scalar::FromRaw(input[index]);
1718 F_Out_Scalar output_out = gemmlowp::tanh(input_in);
1719 output[index] = output_out.raw();
1720 }
1721 }
1722 }
1723
1724 void NeonApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
1725 int32_t n_input, int16_t* output) {
1726 assert(integer_bits <= 6);
1727 #define DISPATCH_TANH(i) \
1728 case i: \
1729 NeonApplyTanhImpl<i>(input, n_batch, n_input, output); \
1730 break;
1731 switch (integer_bits) {
1732 DISPATCH_TANH(0);
1733 DISPATCH_TANH(1);
1734 DISPATCH_TANH(2);
1735 DISPATCH_TANH(3);
1736 DISPATCH_TANH(4);
1737 DISPATCH_TANH(5);
1738 DISPATCH_TANH(6);
1739 default:
1740 return;
1741 }
1742 #undef DISPATCH_TANH
1743 }
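
// Usage sketch (illustrative only): integer_bits selects the Q-format of the
// int16 input. For example,
//   // input holds Q3.12 values in [-8, 8); output is Q0.15 in [-1, 1).
//   NeonApplyTanh(/*integer_bits=*/3, input, n_batch, n_input, output);
// For integer_bits outside [0, 6] the switch hits the default case and the
// output is left untouched.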
1744
1745 void NeonCwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
1746 int n_input, int shift, int16_t* output) {
1747 for (int batch = 0; batch < n_batch; ++batch) {
1748 int i = 0;
1749 for (; i <= n_input - 8; i += 8) {
1750 const int index = batch * n_input + i;
1751 const int16x8_t a = vld1q_s16(input_1 + index);
1752 const int16x8_t b = vld1q_s16(input_2 + index);
1753 const int32x4_t a_s32_0 = vmovl_s16(vget_low_s16(a));
1754 const int32x4_t a_s32_1 = vmovl_s16(vget_high_s16(a));
1755 const int32x4_t b_s32_0 = vmovl_s16(vget_low_s16(b));
1756 const int32x4_t b_s32_1 = vmovl_s16(vget_high_s16(b));
1757
1758 int32x4_t x_0 = vmulq_s32(a_s32_0, b_s32_0);
1759 int32x4_t x_1 = vmulq_s32(a_s32_1, b_s32_1);
1760 x_0 = gemmlowp::RoundingDivideByPOT(x_0, shift);
1761 x_1 = gemmlowp::RoundingDivideByPOT(x_1, shift);
1762
1763 const int16x8_t result = vcombine_s16(vmovn_s32(x_0), vmovn_s32(x_1));
1764 vst1q_s16(output + index, result);
1765 }
1766 for (; TFLITE_UNLIKELY(i < n_input); ++i) {
1767 const int index = batch * n_input + i;
1768 const int16_t a = input_1[index];
1769 const int16_t b = input_2[index];
1770 int64_t x = a * b;
1771 if (x > std::numeric_limits<std::int32_t>::max()) {
1772 x = std::numeric_limits<std::int32_t>::max();
1773 }
1774 const int32_t value = static_cast<int32_t>(x);
1775 output[index] =
1776 static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
1777 }
1778 }
1779 }
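
// Worked example for the int16 overload above (values are made up): with
// shift = 15 this behaves like a Q15 multiply, e.g. a = 16384 (0.5 in Q15) and
// b = 8192 (0.25 in Q15) give a * b = 134217728 and
// RoundingDivideByPOT(134217728, 15) = 4096, i.e. 0.125 in Q15. The shift
// actually used is whatever the caller passes in.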
1780
1781 void NeonCwiseMul(const int16_t* input_1, const int16_t* input_2,
1782 int32_t multiplier, int shift, int n_batch, int n_input,
1783 int32_t output_zp, int8_t* output) {
1784 const int32_t output_min = std::numeric_limits<int8_t>::min();
1785 const int32_t output_max = std::numeric_limits<int8_t>::max();
1786
1787 const int32x4_t output_zp_dup = vdupq_n_s32(-output_zp);
1788 const int32x4_t max_val_dup = vdupq_n_s32(output_max);
1789 const int32x4_t min_val_dup = vdupq_n_s32(output_min);
1790
1791 for (int batch = 0; batch < n_batch; ++batch) {
1792 int i = 0;
1793 for (; i <= n_input - 8; i += 8) {
1794 const int index = batch * n_input + i;
1795 const int16x8_t a = vld1q_s16(input_1 + index);
1796 const int16x8_t b = vld1q_s16(input_2 + index);
1797 const int32x4_t a_s32_0 = vmovl_s16(vget_low_s16(a));
1798 const int32x4_t a_s32_1 = vmovl_s16(vget_high_s16(a));
1799 const int32x4_t b_s32_0 = vmovl_s16(vget_low_s16(b));
1800 const int32x4_t b_s32_1 = vmovl_s16(vget_high_s16(b));
1801
1802 int32x4x2_t temp_val;
1803 temp_val.val[0] = vmulq_s32(a_s32_0, b_s32_0);
1804 temp_val.val[1] = vmulq_s32(a_s32_1, b_s32_1);
1805 temp_val =
1806 MultiplyByQuantizedMultiplier2Rows(temp_val, multiplier, shift);
1807
1808 temp_val.val[0] = vaddq_s32(temp_val.val[0], output_zp_dup);
1809 temp_val.val[1] = vaddq_s32(temp_val.val[1], output_zp_dup);
1810 temp_val.val[0] =
1811 vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup);
1812 temp_val.val[1] =
1813 vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup);
1814
1815 const int16x8_t result =
1816 vcombine_s16(vmovn_s32(temp_val.val[0]), vmovn_s32(temp_val.val[1]));
1817 vst1_s8(output + index, vmovn_s16(result));
1818 }
1819 for (; TFLITE_UNLIKELY(i < n_input); ++i) {
1820 const int index = batch * n_input + i;
1821 const int16_t a = input_1[index];
1822 const int16_t b = input_2[index];
1823 int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
1824 value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
1825 value -= output_zp;
1826 value = std::min(std::max(-128, value), 127);
1827
1828 output[index] = static_cast<int8>(value);
1829 }
1830 }
1831 }
1832
1833 void NeonCwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
1834 int n_input, int16_t* output) {
1835 const int32 int16_max = std::numeric_limits<int16>::max();
1836 const int32 int16_min = std::numeric_limits<int16>::min();
1837 for (int batch = 0; batch < n_batch; ++batch) {
1838 int i = 0;
1839 for (; i <= n_input - 8; i += 8) {
1840 const int index = batch * n_input + i;
1841 const int16x8_t a = vld1q_s16(input_1 + index);
1842 const int16x8_t b = vld1q_s16(input_2 + index);
1843 const int32x4_t a_s32_0 = vmovl_s16(vget_low_s16(a));
1844 const int32x4_t a_s32_1 = vmovl_s16(vget_high_s16(a));
1845 const int32x4_t b_s32_0 = vmovl_s16(vget_low_s16(b));
1846 const int32x4_t b_s32_1 = vmovl_s16(vget_high_s16(b));
1847
1848 const int32x4_t sum_0 = vaddq_s32(a_s32_0, b_s32_0);
1849 const int32x4_t sum_1 = vaddq_s32(a_s32_1, b_s32_1);
1850 vst1_s16(output + index, vqmovn_s32(sum_0));
1851 vst1_s16(output + index + 4, vqmovn_s32(sum_1));
1852 }
1853 for (; TFLITE_UNLIKELY(i < n_input); ++i) {
1854 const int index = batch * n_input + i;
1855 int32_t sum = input_1[index] + input_2[index];
1856 const int32 sum_clamped = std::min(int16_max, std::max(int16_min, sum));
1857 output[index] = static_cast<int16_t>(sum_clamped);
1858 }
1859 }
1860 }
1861
1862 void NeonCwiseClipping(float* vector, const int v_size,
1863 const float clipping_value) {
1864 const float32x4_t clipping_value_f32x4 = vmovq_n_f32(clipping_value);
1865 const float32x4_t neg_clipping_value_f32x4 = vmovq_n_f32(-clipping_value);
1866
1867 int i = 0;
1868 for (; i <= v_size - kFloatValuesPerNeonVector;
1869 i += kFloatValuesPerNeonVector) {
1870 // Load from memory to vector.
1871 float32x4_t v_f32x4 = vld1q_f32(vector + i);
1872 // Clip between clipping_value and -clipping_value.
1873 v_f32x4 = vminq_f32(clipping_value_f32x4, v_f32x4);
1874 v_f32x4 = vmaxq_f32(neg_clipping_value_f32x4, v_f32x4);
1875 // Save to output.
1876 vst1q_f32(vector + i, v_f32x4);
1877 }
1878 for (; TFLITE_UNLIKELY(i < v_size); i++) {
1879 vector[i] = std::max(std::min(clipping_value, vector[i]), -clipping_value);
1880 }
1881 }
1882
1883 void NeonCwiseClipping(int16_t* vector, const int v_size,
1884 const int16_t clipping_value) {
1885 const int16x8_t max_dup = vdupq_n_s16(clipping_value);
1886 const int16x8_t min_dup = vdupq_n_s16(-clipping_value);
1887
1888 int i = 0;
1889 for (; i <= v_size - kInt16ValuesPerNeonVector * 2;
1890 i += kInt16ValuesPerNeonVector * 2) {
1891 int16x8_t val_0 = vld1q_s16(vector + i);
1892 int16x8_t val_1 = vld1q_s16(vector + i + kInt16ValuesPerNeonVector);
1893 val_0 = vminq_s16(val_0, max_dup);
1894 val_1 = vminq_s16(val_1, max_dup);
1895 val_0 = vmaxq_s16(val_0, min_dup);
1896 val_1 = vmaxq_s16(val_1, min_dup);
1897 vst1q_s16(vector + i, val_0);
1898 vst1q_s16(vector + i + kInt16ValuesPerNeonVector, val_1);
1899 }
1900 for (; TFLITE_UNLIKELY(i < v_size); i++) {
1901 vector[i] = std::max(std::min(clipping_value, vector[i]),
1902 static_cast<int16_t>(-clipping_value));
1903 }
1904 }
1905
1906 void NeonCwiseClipping(int8_t* vector, const int v_size,
1907 const int8_t clipping_value) {
1908 const int8x16_t max_dup = vdupq_n_s8(clipping_value);
1909 const int8x16_t min_dup = vdupq_n_s8(-clipping_value);
1910
1911 int i = 0;
1912 for (; i < v_size - kInt8ValuesPerNeonVector * 2;
1913 i += kInt8ValuesPerNeonVector * 2) {
1914 int8x16_t val_0 = vld1q_s8(vector + i);
1915 int8x16_t val_1 = vld1q_s8(vector + i + kInt8ValuesPerNeonVector);
1916 val_0 = vminq_s8(val_0, max_dup);
1917 val_1 = vminq_s8(val_1, max_dup);
1918 val_0 = vmaxq_s8(val_0, min_dup);
1919 val_1 = vmaxq_s8(val_1, min_dup);
1920 vst1q_s8(vector + i, val_0);
1921 vst1q_s8(vector + i + kInt8ValuesPerNeonVector, val_1);
1922 }
1923 for (; TFLITE_UNLIKELY(i < v_size); i++) {
1924 vector[i] = std::max(std::min(clipping_value, vector[i]),
1925 static_cast<int8_t>(-clipping_value));
1926 }
1927 }
1928
1929 void NeonSparseMatrixBatchVectorMultiplyAccumulate1x4(
1930 const float* __restrict__ matrix, const int32_t* __restrict__ segments,
1931 const int32_t* __restrict__ indices, int m_rows, int m_cols,
1932 const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
1933 constexpr int kBlockSize = kFloatValuesPerNeonVector;
1934 TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
1935
1936 for (int batch = 0; batch < n_batch; batch++) {
1937 const float* matrix_ptr = matrix;
1938 for (int row = 0; row < m_rows; row++) {
1939 float32x4_t acc_32x4 = vmovq_n_f32(0.0);
1940 const float* vector_in_batch = vector + batch * m_cols;
1941
1942 for (int i = segments[row]; i < segments[row + 1]; i++) {
1943 const int block_start_index = indices[i] * kBlockSize;
1944 const float* vector_block_in_batch_ptr =
1945 vector_in_batch + block_start_index;
1946
1947 // Load 4 float values from the vector and matrix row.
1948 float32x4_t vector_f32x4 = vld1q_f32(vector_block_in_batch_ptr);
1949 float32x4_t matrix_f32x4 = vld1q_f32(matrix_ptr);
1950 // Multiply the vector and matrix row and add to accumulator.
1951 acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
1952 matrix_ptr += kBlockSize;
1953 }
1954 result[batch * m_rows + row] += AccumulateNeonLane(acc_32x4);
1955 }
1956 }
1957 }
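
// Format sketch for the 1x4 block-sparse layout consumed above (illustrative):
// segments/indices form a CSR-like structure over 1x4 blocks, and `matrix`
// stores only the non-zero blocks back to back. For m_cols = 8, a single row
// whose only non-zero block covers columns 4..7 could be encoded as
//   segments = {0, 1}, indices = {1}
// with `matrix` holding just those 4 floats; block_start_index is then
// indices[i] * 4.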
1958
1959 void NeonSparseMatrixBatchVectorMultiplyAccumulate1x16(
1960 const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
1961 const int32_t* __restrict__ indices, int m_rows, int m_cols,
1962 const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
1963 int n_batch, const int32_t input_offset, const int32_t output_multiplier,
1964 const int32_t output_shift, const int32_t output_offset,
1965 const int32_t output_activation_min, const int32_t output_activation_max,
1966 int8_t* __restrict__ result) {
1967 constexpr int kBlockSize = kInt8ValuesPerNeonVector;
1968 TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
1969
1970 for (int batch = 0; batch < n_batch; ++batch) {
1971 const int8_t* matrix_ptr = matrix;
1972 for (int row = 0; row < m_rows; ++row) {
1973 // Accumulation loop.
1974 int32x4_t acc_i32x4 = vmovq_n_s32(0);
1975 int32_t matrix_row_sum = 0;
1976 const int8_t* vector_in_batch = vector + batch * m_cols;
1977
1978 for (int i = segments[row]; i < segments[row + 1]; ++i) {
1979 const int block_start_index = indices[i] * kBlockSize;
1980 const int8_t* vector_block_in_batch_ptr =
1981 vector_in_batch + block_start_index;
1982
1983 // Load 16 int8 values from the vector and matrix row.
1984 int8x16_t vector_i8x16 = vld1q_s8(vector_block_in_batch_ptr);
1985 int8x16_t matrix_i8x16 = vld1q_s8(matrix_ptr);
1986 #ifdef __aarch64__
1987 int16_t matrix_block_sum = vaddlvq_s8(matrix_i8x16);
1988 #else
1989 int16_t matrix_block_sum =
1990 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 0)) +
1991 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 1)) +
1992 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 2)) +
1993 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 3)) +
1994 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 4)) +
1995 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 5)) +
1996 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 6)) +
1997 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 7)) +
1998 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 8)) +
1999 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 9)) +
2000 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 10)) +
2001 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 11)) +
2002 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 12)) +
2003 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 13)) +
2004 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 14)) +
2005 static_cast<int8_t>(vgetq_lane_s8(matrix_i8x16, 15));
2006 #endif
2007
2008 // Multiply the vector and matrix row and add to accumulator.
2009 int16x8_t acc_i16x8 =
2010 vmull_s8(vget_low_s8(vector_i8x16), vget_low_s8(matrix_i8x16));
2011 acc_i16x8 = vmlal_s8(acc_i16x8, vget_high_s8(vector_i8x16),
2012 vget_high_s8(matrix_i8x16));
2013 acc_i32x4 = vpadalq_s16(acc_i32x4, acc_i16x8);
2014 matrix_row_sum += matrix_block_sum;
2015 matrix_ptr += kBlockSize;
2016 }
2017 #ifdef __aarch64__
2018 int32_t acc = vaddvq_s32(acc_i32x4);
2019 #else
2020 int32_t acc = vgetq_lane_s32(acc_i32x4, 0) +
2021 vgetq_lane_s32(acc_i32x4, 1) +
2022 vgetq_lane_s32(acc_i32x4, 2) + vgetq_lane_s32(acc_i32x4, 3);
2023 #endif
2024 const int32_t bias_value = bias_vector != nullptr ? bias_vector[row] : 0;
2025 acc = acc + bias_value + input_offset * matrix_row_sum;
2026 acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
2027 acc += output_offset;
2028 result[batch * m_rows + row] =
2029 static_cast<int8_t>(ActivationFunctionWithMinMax(
2030 acc, output_activation_min, output_activation_max));
2031 }
2032 }
2033 }
2034
2035 void NeonSparseMatrixBatchVectorMultiplyAccumulate(
2036 const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
2037 int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
2038 float* __restrict__ result) {
2039 constexpr int kNeonVectorsPerBlock = 4;
2040 constexpr int kBlockSize = kNeonVectorsPerBlock * kFloatValuesPerNeonVector;
2041 TFLITE_DCHECK_EQ( // NOLINT
2042 m_cols % kBlockSize, 0);
2043
2044 for (int batch = 0; batch < n_batch; batch++) {
2045 const float* matrix_ptr = matrix;
2046 const uint8_t* ledger_ptr = ledger;
2047 for (int row = 0; row < m_rows; row++) {
2048 int num_nonzero_blocks = *ledger_ptr++;
2049 if (num_nonzero_blocks > 0) {
2050 float32x4_t acc_32x4 = vmovq_n_f32(0.0);
2051 const float* vector_in_batch = vector + batch * m_cols;
2052
2053 for (int i = 0; i < num_nonzero_blocks; i++) {
2054 const int block_start_index = *ledger_ptr++ * kBlockSize;
2055 const float* vector_block_in_batch_ptr =
2056 vector_in_batch + block_start_index;
2057
2058 for (int c = 0; c < kNeonVectorsPerBlock; c++) {
2059 // Load 4 float values from the vector and matrix row.
2060 float32x4_t vector_f32x4 = vld1q_f32(vector_block_in_batch_ptr +
2061 c * kFloatValuesPerNeonVector);
2062 float32x4_t matrix_f32x4 =
2063 vld1q_f32(matrix_ptr + c * kFloatValuesPerNeonVector);
2064 // Multiply the vector and matrix row and add to accumulator.
2065 acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
2066 }
2067 matrix_ptr += kBlockSize;
2068 }
2069 result[batch * m_rows + row] += AccumulateNeonLane(acc_32x4);
2070 }
2071 }
2072 }
2073 }
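
// Ledger sketch for the float sparse kernel above (illustrative): for each row
// the ledger stores the number of non-zero 16-float blocks followed by their
// block indices. With m_cols = 32 (two blocks per row), a row whose second
// block is the only non-zero one is encoded as
//   ledger = {1, 1, ...}
// and `matrix` stores only that block's 16 floats.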
2074
2075 void NeonSparseMatrixBatchVectorMultiplyAccumulate(
2076 const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
2077 const int m_cols, const int8_t* __restrict__ vectors,
2078 const float* scaling_factors, int n_batch, float* __restrict__ result) {
2079 #ifdef __aarch64__
2080 if (HasSdotInstruction() && m_cols % 16 == 0) {
2081 DotprodSparseMatrixBatchVectorMultiplyAccumulate(
2082 matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch,
2083 result);
2084 return;
2085 }
2086 #endif // __aarch64__
2087
2088 constexpr int kBlockSize = kInt8ValuesPerNeonVector;
2089 TFLITE_DCHECK_EQ( // NOLINT
2090 m_cols % kBlockSize, 0);
2091 void* aligned_vec_free = nullptr;
2092 int8_t* aligned_vec =
2093 (int8_t*)aligned_alloc(kNeonVectorAlignment, m_cols, // NOLINT
2094 &aligned_vec_free);
2095
2096 for (int batch = 0; batch < n_batch; ++batch) {
2097 const float batch_scaling_factor = scaling_factors[batch];
2098 // Copy the vector data to an aligned vector.
2099 memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8) * m_cols);
2100
2101 const uint8_t* ledger_ptr = ledger;
2102 const int8_t* row_ptr = matrix;
2103 for (int row = 0; row < m_rows; ++row) {
2104 // Initialize the dot product sum for the row to 0.
2105 int32x4_t dotprod_32x4 = vmovq_n_s32(0);
2106 int num_nonzero_blocks = *ledger_ptr++;
2107 if (num_nonzero_blocks > 0) {
2108 // Prefetch the row to cache.
2109 __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
2110 3 /* temporal locality */);
2111 for (int i = 0; i < num_nonzero_blocks; i++) {
2112 const int col_index = *ledger_ptr++ * kBlockSize;
2113 // Load 16 8-bit values from the row and vector, each, to operate on.
2114 // Here the assumption is that each buffer is 4-byte aligned.
2115 // Otherwise, performance may suffer significantly.
2116           TFLITE_DCHECK_EQ(  // NOLINT
2117               (uintptr_t)(row_ptr) & (kNeonVectorAlignment - 1), 0);
2118 const int8x16_t s1_8x16 =
2119 vld1q_s8((const int8_t*)(aligned_vec + col_index));
2120 const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr));
2121 // Multiply the low bits (i.e. the lower 8 8bit numbers in the
2122 // registers).
2123 int16x8_t prod_16x8 =
2124 vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
2125           // Multiply the high bits (i.e. the higher 8 8bit numbers in the
2126           // registers), and accumulate with the result of the low bits product.
2127           // The assumption here is that overflow will not happen as we quantize
2128           // our values to be in the range [-127, 127]. As such the sum of the 2
2129           // products always fits within 15 bits (at most 32767 in absolute
2130           // value).
2131 prod_16x8 =
2132 vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
2133
2134 dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
2135 row_ptr += kBlockSize;
2136 }
2137 // Add the 4 intermediate sum values to get the final dot-prod value for
2138 // this row.
2139 int32_t dotprod = AccumulateNeonLane(dotprod_32x4);
2140 result[batch * m_rows + row] += dotprod * batch_scaling_factor;
2141 }
2142 } // for row
2143 } // for batch
2144 free(aligned_vec_free);
2145 }
2146
2147 void NeonSub1Vector(const float* vector, int v_size, float* result) {
2148 // If v_size is not divisible by the vector size, then we need to process the
2149 // final few elements sequentially. postamble_start shows the start index
2150 // where this should happen.
2151 const int postamble_start =
2152 RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
2153
2154 float32x4_t one_f32x4 = vmovq_n_f32(1.0);
2155 int v = 0;
2156 for (; v < postamble_start; v += kFloatValuesPerNeonVector) {
2157 // Load 4 float values from the current pointers of the input column and
2158 // subtract from 1.
2159 float32x4_t v_f32x4 = vld1q_f32(vector + v);
2160 float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
2161 // Save to output.
2162 vst1q_f32(result + v, result_f32x4);
2163 }
2164 for (; TFLITE_UNLIKELY(v < v_size); v++) {
2165 result[v] = 1.0f - vector[v];
2166 }
2167 }
2168
2169 void NeonSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
2170 int postamble_start = RoundDownVectors<kInt16ValuesPerNeonVector>(v_size);
2171 static const int16_t kOne = 32767;
2172   // Use XOR to replace subtraction from (1 << 15) - 1 (i.e. 32767).
2173   // A local benchmark shows it's slightly faster than a plain subtract.
2174 const int16x8_t one_dup = vdupq_n_s16(kOne);
2175 int i = 0;
2176 for (; i < postamble_start; i += kInt16ValuesPerNeonVector) {
2177 const int16x8_t input = vld1q_s16(vector + i);
2178 const int16x8_t sub1_result = veorq_s16(one_dup, input);
2179 vst1q_s16(result + i, sub1_result);
2180 }
2181 for (; TFLITE_UNLIKELY(i < v_size); i++) {
2182 result[i] = kOne ^ vector[i];
2183 }
2184 }
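
// Why the XOR above works (a note, not extra code): the int16 inputs are
// treated as non-negative Q15 values, and for any x in [0, 32767]
//   32767 ^ x == 32767 - x,
// e.g. x = 12288 (0.375 in Q15) gives 32767 ^ 12288 = 20479 = 32767 - 12288.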
2185
2186 namespace {
2187
2188 #ifdef __aarch64__
2189 inline bool IsAllZero(const int8x16_t v_s8x16) {
2190 const uint32_t u32 = vmaxvq_u32(vreinterpretq_u32_s8(v_s8x16));
2191 return !u32;
2192 }
2193
2194 inline bool IsAllZero(const float32x4_t v_f32x4) {
2195 const uint32x4_t cmp_result = vceqzq_f32(v_f32x4);
2196 const uint32_t u32 = vminvq_u32(cmp_result);
2197 return u32;
2198 }
2199 #else
2200 inline bool IsAllZero(const uint32x4_t u32x4) {
2201 const uint32x2_t u32x2 = vqadd_u32(vget_high_u32(u32x4), vget_low_u32(u32x4));
2202 const uint64x1_t u64 = vreinterpret_u64_u32(u32x2);
2203 return !vget_lane_u64(u64, 0);
2204 }
2205
2206 #ifndef __SSE__
2207 // With the Intel NEON-2-SSE translator library, this is a redefinition.
2208 inline bool IsAllZero(const int8x16_t v) {
2209 return IsAllZero(vreinterpretq_u32_s8(v));
2210 }
2211 #endif
2212
2213 inline bool IsAllZero(const float32x4_t v_f32x4) {
2214 const float32x4_t zero_f32x4 = vmovq_n_f32(0.0f);
2215 // Compare-absolute greater-than, |v| > |0|, equivalently v != 0
2216 const uint32x4_t cmp_result = vcagtq_f32(v_f32x4, zero_f32x4);
2217 return IsAllZero(cmp_result);
2218 }
2219 #endif
2220
2221 } // namespace
2222
2223 bool NeonIsZeroVector(const float* vector, int v_size) {
2224 // If v_size is not divisible by the vector size, then we need to process the
2225 // final few elements sequentially. postamble_start shows the start index
2226 // where this should happen.
2227 const int postamble_start =
2228 RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
2229
2230 int v = 0;
2231 for (; v < postamble_start; v += kFloatValuesPerNeonVector) {
2232 const float32x4_t v_f32x4 = vld1q_f32(vector + v);
2233 if (!IsAllZero(v_f32x4)) return false;
2234 }
2235 // Postamble loop
2236 for (; TFLITE_UNLIKELY(v < v_size); ++v) {
2237 if (vector[v] != 0.0) return false;
2238 }
2239 return true;
2240 }
2241
2242 bool NeonIsZeroVector(const int8_t* vector, int v_size) {
2243 // If v_size is not divisible by the vector size, then we need to process the
2244 // final few elements sequentially. postamble_start shows the start index
2245 // where this should happen.
2246 const int postamble_start =
2247 RoundDownVectors<kInt8ValuesPerNeonVector>(v_size);
2248
2249 int v = 0;
2250 for (; v < postamble_start; v += kInt8ValuesPerNeonVector) {
2251 const int8x16_t v_s8x16 = vld1q_s8(vector + v);
2252 if (!IsAllZero(v_s8x16)) return false;
2253 }
2254 // Postamble loop
2255 for (; TFLITE_UNLIKELY(v < v_size); ++v) {
2256 if (vector[v] != 0) return false;
2257 }
2258 return true;
2259 }
2260
2261 void NeonVectorScalarMultiply(const int8_t* vector, const int v_size,
2262 const float scale, float* result) {
2263 // Here the assumption is that each buffer is 4-byte aligned.
2264 TFLITE_CHECK_EQ((intptr_t)(&vector[0]) & (kNeonVectorAlignment - 1), 0);
2265 // If v_size is not divisible by kInt8ValuesPerNeonVector, we cannot use the
2266 // main vectorized loop, and we need to process sequentially. postamble_start
2267 // shows the start index where this should happen.
2268 const int postamble_start =
2269 RoundDownVectors<kInt8ValuesPerNeonVector>(v_size);
2270
2271 // Create a vector of 4 floats with the scale value.
2272 const float32x4_t scale_f32x4 = vdupq_n_f32(scale);
2273 int v = 0;
2274 for (; v < postamble_start; v += kInt8ValuesPerNeonVector) {
2275 // Load int8 values, sixteen at a time.
2276 const int8x16_t v_i8x16 = vld1q_s8(vector + v);
2277 // Split it into two components of size eight.
2278 const int8x8_t v0_i8x8 = vget_low_s8(v_i8x16);
2279 const int8x8_t v1_i8x8 = vget_high_s8(v_i8x16);
2280 // Convert both components to int16 first.
2281 const int16x8_t v0_i16x8 = vmovl_s8(v0_i8x8);
2282 const int16x8_t v1_i16x8 = vmovl_s8(v1_i8x8);
2283 // Split each of them into two components each.
2284 const int16x4_t v0_i16x4 = vget_low_s16(v0_i16x8);
2285 const int16x4_t v1_i16x4 = vget_high_s16(v0_i16x8);
2286 const int16x4_t v2_i16x4 = vget_low_s16(v1_i16x8);
2287 const int16x4_t v3_i16x4 = vget_high_s16(v1_i16x8);
2288 // Convert these to int32 and then to float.
2289 float32x4_t v0_f32x4 = vcvtq_f32_s32(vmovl_s16(v0_i16x4));
2290 float32x4_t v1_f32x4 = vcvtq_f32_s32(vmovl_s16(v1_i16x4));
2291 float32x4_t v2_f32x4 = vcvtq_f32_s32(vmovl_s16(v2_i16x4));
2292 float32x4_t v3_f32x4 = vcvtq_f32_s32(vmovl_s16(v3_i16x4));
2293 // Vector multiply four floats at a time.
2294 v0_f32x4 = vmulq_f32(v0_f32x4, scale_f32x4);
2295 v1_f32x4 = vmulq_f32(v1_f32x4, scale_f32x4);
2296 v2_f32x4 = vmulq_f32(v2_f32x4, scale_f32x4);
2297 v3_f32x4 = vmulq_f32(v3_f32x4, scale_f32x4);
2298 // Store the results.
2299 vst1q_f32(result + v, v0_f32x4);
2300 vst1q_f32(result + v + 4, v1_f32x4);
2301 vst1q_f32(result + v + 8, v2_f32x4);
2302 vst1q_f32(result + v + 12, v3_f32x4);
2303 }
2304
2305 if (TFLITE_UNLIKELY(v_size - postamble_start >=
2306 (kInt8ValuesPerNeonVector >> 1))) {
2307     // Load eight int8 values, if there are at least eight remaining.
2308 const int8x8_t v_i8x8 = vld1_s8(vector + v);
2309 // Convert them to int16 first.
2310 const int16x8_t v_i16x8 = vmovl_s8(v_i8x8);
2311 // Split it into two components.
2312 const int16x4_t v0_i16x4 = vget_low_s16(v_i16x8);
2313 const int16x4_t v1_i16x4 = vget_high_s16(v_i16x8);
2314     // Convert the components to floats.
2315 float32x4_t v0_f32x4 = vcvtq_f32_s32(vmovl_s16(v0_i16x4));
2316 float32x4_t v1_f32x4 = vcvtq_f32_s32(vmovl_s16(v1_i16x4));
2317 // Vector multiply four floats at a time.
2318 v0_f32x4 = vmulq_f32(v0_f32x4, scale_f32x4);
2319 v1_f32x4 = vmulq_f32(v1_f32x4, scale_f32x4);
2320 // Store the results.
2321 vst1q_f32(result + v, v0_f32x4);
2322 vst1q_f32(result + v + 4, v1_f32x4);
2323 v += (kInt8ValuesPerNeonVector >> 1);
2324 }
2325
2326 // Postamble loop.
2327 for (; TFLITE_UNLIKELY(v < v_size); v++) {
2328 result[v] = scale * vector[v];
2329 }
2330 }
2331
2332 // TODO(b/185850916): Consider changing the rounding strategy from "ties to
2333 // away" to "ties to even" since vcvtnq_s32_f32 is generally more available.
2334 inline int32x4_t RoundToNearest(const float32x4_t input) {
2335 #if __ARM_ARCH >= 8
2336 return vcvtaq_s32_f32(input);
2337 #else
2338 static const float32x4_t zero_val_dup = vdupq_n_f32(0.0f);
2339 static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f);
2340
2341 const int32x4_t mask = vreinterpretq_s32_u32(vcltq_f32(input, zero_val_dup));
2342 const float32x4_t casted_mask = vcvtq_f32_s32(mask);
2343 const float32x4_t round = vaddq_f32(casted_mask, point5_val_dup);
2344 return vcvtq_s32_f32(vaddq_f32(input, round));
2345 #endif
2346 }
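
// Rounding sketch (illustrative values): "ties away from zero" means 2.5
// rounds to 3 and -2.5 rounds to -3. The pre-ARMv8 path reproduces this by
// adding +0.5 to non-negative lanes and -0.5 to negative lanes (the vcltq_f32
// mask converts to 0.0f or -1.0f, so casted_mask + 0.5f is +/-0.5f) and then
// truncating toward zero with vcvtq_s32_f32.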
2347
2348 // Note: this function caps minimum and maximum at zero, unlike the true
2349 // minmax_element. This is intentional.
2350 inline void NeonMinMax(const float* values, const int size, float* min,
2351 float* max) {
2352 const int postamble_start = RoundDownVectors<kFloatValuesPerNeonVector>(size);
2353 float rmin = 0.0f, rmax = 0.0f;
2354 int i = 0;
2355 if (postamble_start) {
2356 float32x4_t min_f32x4 = vld1q_f32(values);
2357 float32x4_t max_f32x4 = min_f32x4;
2358 for (i = kFloatValuesPerNeonVector; i < postamble_start;
2359 i += kFloatValuesPerNeonVector) {
2360 const float32x4_t value0_f32x4 = vld1q_f32(&values[i]);
2361 min_f32x4 = vminq_f32(min_f32x4, value0_f32x4);
2362 max_f32x4 = vmaxq_f32(max_f32x4, value0_f32x4);
2363 }
2364 #ifdef __aarch64__
2365 rmin = std::min(rmin, vminvq_f32(min_f32x4));
2366 rmax = std::max(rmax, vmaxvq_f32(max_f32x4));
2367 #else
2368 float32x2_t min_f32x2 =
2369 vmin_f32(vget_low_f32(min_f32x4), vget_high_f32(min_f32x4));
2370 float32x2_t max_f32x2 =
2371 vmax_f32(vget_low_f32(max_f32x4), vget_high_f32(max_f32x4));
2372 min_f32x2 = vpmin_f32(min_f32x2, min_f32x2);
2373 max_f32x2 = vpmax_f32(max_f32x2, max_f32x2);
2374 rmin = std::min(rmin, vget_lane_f32(min_f32x2, 0));
2375 rmax = std::max(rmax, vget_lane_f32(max_f32x2, 0));
2376 #endif // __aarch64__
2377 }
2378 if (TFLITE_UNLIKELY(i < size)) {
2379 const auto minmax =
2380 std::minmax_element(values + postamble_start, values + size);
2381 rmin = std::min(rmin, *minmax.first);
2382 rmax = std::max(rmax, *minmax.second);
2383 }
2384 *min = rmin;
2385 *max = rmax;
2386 }
2387
2388 void NeonSymmetricQuantizeFloats(const float* values, const int size,
2389 int8_t* quantized_values, float* min,
2390 float* max, float* scaling_factor) {
2391 // TODO(raziel): vectorize min/max calculation.
2392 auto minmax = std::minmax_element(values, values + size);
2393 *min = *minmax.first;
2394 *max = *minmax.second;
2395 NeonSymmetricQuantizeFloats(values, size, quantized_values, *min, *max,
2396 scaling_factor);
2397 }
2398
2399 void NeonSymmetricQuantizeFloats(const float* values, const int size,
2400 int8_t* quantized_values, float min, float max,
2401 float* scaling_factor) {
2402 constexpr int kScale = 127;
2403 const float range = std::max(std::abs(min), std::abs(max));
2404 if (range == 0) {
2405 memset(quantized_values, 0, size * sizeof(int8_t));
2406 *scaling_factor = 1;
2407 return;
2408 }
2409 *scaling_factor = range / kScale;
2410 const float scaling_factor_inv = kScale / range;
2411
2412 const int postamble_start =
2413 RoundDownVectors<(2 * kFloatValuesPerNeonVector)>(size);
2414
2415 // Vectorized constants.
2416 const float32x4_t q_factor_f32x4 = vmovq_n_f32(scaling_factor_inv);
2417 const int32x4_t scale_i32x4 = vmovq_n_s32(kScale);
2418 const int32x4_t neg_scale_i32x4 = vmovq_n_s32(-kScale);
2419
2420 int i = 0;
2421 for (; i < postamble_start; i += 2 * kFloatValuesPerNeonVector) {
2422 // Implements the vectorized version of the following:
2423 // const int32 quantized_value = static_cast<int32>(
2424     //        std::round(scaling_factor_inv * values[i]));
2425 float32x4_t value0_f32x4 = vld1q_f32(&values[i]);
2426 float32x4_t value1_f32x4 =
2427 vld1q_f32(&values[i + kFloatValuesPerNeonVector]);
2428 float32x4_t mul0_f32x4 = vmulq_f32(value0_f32x4, q_factor_f32x4);
2429 float32x4_t mul1_f32x4 = vmulq_f32(value1_f32x4, q_factor_f32x4);
2430
2431 const int32x4_t f2i0_i32x4 = RoundToNearest(mul0_f32x4);
2432 const int32x4_t f2i1_i32x4 = RoundToNearest(mul1_f32x4);
2433
2434 // Implements the vectorized version of the following block:
2435 // quantized_values[i] = std::min(kScale, std::max(-kScale,
2436 // quantized_value));
2437 int32x4_t max0_i32x4 = vmaxq_s32(f2i0_i32x4, neg_scale_i32x4);
2438 int32x4_t max1_i32x4 = vmaxq_s32(f2i1_i32x4, neg_scale_i32x4);
2439 int32x4_t min0_i32x4 = vminq_s32(max0_i32x4, scale_i32x4);
2440 int32x4_t min1_i32x4 = vminq_s32(max1_i32x4, scale_i32x4);
2441
2442 int16x4_t min0_16x4 = vmovn_s32(min0_i32x4);
2443 int16x4_t min1_16x4 = vmovn_s32(min1_i32x4);
2444
2445 int16x8_t min_16x8 = vcombine_s16(min0_16x4, min1_16x4);
2446 int8x8_t min_s8x8 = vqmovn_s16(min_16x8);
2447 vst1_s8(&quantized_values[i], min_s8x8);
2448 }
2449
2450 for (; TFLITE_UNLIKELY(i < size); ++i) {
2451 const int32 quantized_value =
2452 static_cast<int32>(TfLiteRound(scaling_factor_inv * values[i]));
2453 quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
2454 }
2455 }
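
// Worked example (made-up values): for min = -0.5 and max = 2.0 the range is
// max(|min|, |max|) = 2.0, so *scaling_factor = 2.0 / 127 and an input of 1.0
// quantizes to round(1.0 * 127 / 2.0) = 64 before the [-127, 127] clamp.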
2456
2457 void NeonAsymmetricQuantizeFloats(const float* values, const int size,
2458 int8_t* quantized_values,
2459 float* scaling_factor, int32_t* offset) {
2460 float rmin, rmax;
2461 NeonMinMax(values, size, &rmin, &rmax);
2462
2463 const int32_t kMinScale = -128;
2464 const int32_t kMaxScale = 127;
2465 const double qmin_double = kMinScale;
2466 const double qmax_double = kMaxScale;
2467 if (rmin == rmax) {
2468 memset(quantized_values, 0, size * sizeof(int8_t));
2469 *scaling_factor = 1;
2470 *offset = 0;
2471 return;
2472 } else {
2473 const double scale = (rmax - rmin) / (qmax_double - qmin_double);
2474 const double zero_point_from_min = qmin_double - rmin / scale;
2475 const double zero_point_from_max = qmax_double - rmax / scale;
2476 const double zero_point_from_min_error =
2477 std::abs(qmin_double) + std::abs(rmin / scale);
2478 const double zero_point_from_max_error =
2479 std::abs(qmax_double) + std::abs(rmax / scale);
2480 const double zero_point_double =
2481 zero_point_from_min_error < zero_point_from_max_error
2482 ? zero_point_from_min
2483 : zero_point_from_max;
2484 int8 nudged_zero_point = 0;
2485 if (zero_point_double <= qmin_double) {
2486 nudged_zero_point = kMinScale;
2487 } else if (zero_point_double >= qmax_double) {
2488 nudged_zero_point = kMaxScale;
2489 } else {
2490 nudged_zero_point = static_cast<int8>(round(zero_point_double));
2491 }
2492 *scaling_factor = scale;
2493 *offset = nudged_zero_point;
2494 }
2495
2496 const int postamble_start =
2497 RoundDownVectors<(2 * kFloatValuesPerNeonVector)>(size);
2498 const float scaling_factor_inv =
2499 *scaling_factor == 0 ? 0 : 1.0 / *scaling_factor;
2500 const float32x4_t q_factor_f32x4 = vmovq_n_f32(scaling_factor_inv);
2501 const int32x4_t scale_i32x4 = vmovq_n_s32(kMaxScale);
2502 const int32x4_t neg_scale_i32x4 = vmovq_n_s32(kMinScale);
2503 const int32x4_t offset_i32x4 = vmovq_n_s32(*offset);
2504
2505 int i = 0;
2506 for (; i < postamble_start; i += 2 * kFloatValuesPerNeonVector) {
2507 float32x4_t value0_f32x4 = vld1q_f32(&values[i]);
2508 float32x4_t value1_f32x4 =
2509 vld1q_f32(&values[i + kFloatValuesPerNeonVector]);
2510 float32x4_t mul0_f32x4 = vmulq_f32(value0_f32x4, q_factor_f32x4);
2511 float32x4_t mul1_f32x4 = vmulq_f32(value1_f32x4, q_factor_f32x4);
2512
2513 const int32x4_t f2i0_i32x4 = RoundToNearest(mul0_f32x4);
2514 const int32x4_t f2i1_i32x4 = RoundToNearest(mul1_f32x4);
2515
2516 // Add offset
2517 int32x4_t q0_i32x4 = vaddq_s32(f2i0_i32x4, offset_i32x4);
2518 int32x4_t q1_i32x4 = vaddq_s32(f2i1_i32x4, offset_i32x4);
2519
2520 int32x4_t max0_i32x4 = vmaxq_s32(q0_i32x4, neg_scale_i32x4);
2521 int32x4_t max1_i32x4 = vmaxq_s32(q1_i32x4, neg_scale_i32x4);
2522 int32x4_t min0_i32x4 = vminq_s32(max0_i32x4, scale_i32x4);
2523 int32x4_t min1_i32x4 = vminq_s32(max1_i32x4, scale_i32x4);
2524
2525 int16x4_t min0_16x4 = vmovn_s32(min0_i32x4);
2526 int16x4_t min1_16x4 = vmovn_s32(min1_i32x4);
2527
2528 int16x8_t min_16x8 = vcombine_s16(min0_16x4, min1_16x4);
2529 int8x8_t min_s8x8 = vqmovn_s16(min_16x8);
2530 vst1_s8(&quantized_values[i], min_s8x8);
2531 }
2532
2533 for (; TFLITE_UNLIKELY(i < size); ++i) {
2534 const int32 quantized_value = static_cast<int32>(
2535 *offset + TfLiteRound(scaling_factor_inv * values[i]));
2536 quantized_values[i] =
2537 std::min(kMaxScale, std::max(kMinScale, quantized_value));
2538 }
2539 }
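
// Worked example (made-up values): for rmin = -1.0 and rmax = 3.0,
// scale = 4.0 / 255 and the candidate zero points computed from the min and
// max ends both land at -64.25, so the nudged *offset is -64. An input of
// -1.0 then maps to -64 + round(-1.0 / scale) = -128, and 3.0 maps to 127.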
2540
2541 float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
2542 int v_size) {
2543 // If v_size is not divisible by the vector size, then we need to process the
2544 // final few elements sequentially. postamble_start shows the start index
2545 // where this should happen.
2546 const int postamble_start =
2547 RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
2548 float32x4_t acc_32x4 = vmovq_n_f32(0.0);
2549 int v = 0;
2550 for (; v < postamble_start; v += kFloatValuesPerNeonVector) {
2551 // Load 4 float values from vector1 and vector2 and accumulator.
2552 float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
2553 float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
2554     // Vector multiply-accumulate 4 floats.
2555 acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
2556 }
2557 float result = AccumulateNeonLane(acc_32x4);
2558 // Postamble loop.
2559 for (; TFLITE_UNLIKELY(v < v_size); v++) {
2560 result += vector1[v] * vector2[v];
2561 }
2562 return result;
2563 }
2564
2565 void NeonReductionSumVector(const float* input_vector, float* output_vector,
2566 int output_size, int reduction_size) {
2567 for (int o = 0; o < output_size; o++) {
2568     // If reduction_size is not divisible by the vector size, then we need to
2569     // process the final few elements sequentially. postamble_start shows the
2570     // start index where this should happen.
2571 const int postamble_start =
2572 RoundDownVectors<kFloatValuesPerNeonVector>(reduction_size);
2573 float32x4_t sum_f32x4 = vmovq_n_f32(0.0);
2574 int r = 0;
2575 for (; r < postamble_start; r += kFloatValuesPerNeonVector) {
2576 float32x4_t v1_f32x4 = vld1q_f32(input_vector + r);
2577 sum_f32x4 = vaddq_f32(sum_f32x4, v1_f32x4);
2578 }
2579 float sum = AccumulateNeonLane(sum_f32x4);
2580 // Postamble loop.
2581 for (; TFLITE_UNLIKELY(r < reduction_size); r++) {
2582 sum += input_vector[r];
2583 }
2584 output_vector[o] = sum;
2585 input_vector += reduction_size;
2586 }
2587 }
2588
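// Same row-wise reduction as above, for int8 input with int32 output. The
// pairwise widening adds (vpaddlq_s8 / vpadalq_s16) fold 16 int8 lanes into
// the int32 accumulator without intermediate overflow. For example, with
// output_size = 2 and reduction_size = 3, input {1, 2, 3, -1, -2, -3} yields
// output {6, -6}.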
2589 void NeonReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
2590 const int output_size, const int reduction_size) {
2591 const int postamble_half_start =
2592 RoundDownVectors<kInt8ValuesPerNeonVector>(reduction_size);
2593 const int postamble_start =
2594 RoundDownVectors<(kInt8ValuesPerNeonVector / 2)>(reduction_size);
2595 for (int o = 0; o < output_size; ++o) {
2596 int32x4_t sum_32x4 = vmovq_n_s32(0);
2597 int r = 0;
2598 for (; r < postamble_half_start; r += kInt8ValuesPerNeonVector) {
2599 const int8x16_t s2_8x16 = vld1q_s8(input_vector + r);
2600 sum_32x4 = vpadalq_s16(sum_32x4, vpaddlq_s8(s2_8x16));
2601 }
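// Handle a remaining half NEON vector (8 int8 values), if any.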
2602 if (TFLITE_UNLIKELY(r < postamble_start)) {
2603 const int8x8_t s2_8x8 = vld1_s8(input_vector + r);
2604 sum_32x4 = vpadalq_s16(sum_32x4, vmovl_s8(s2_8x8));
2605 r += (kInt8ValuesPerNeonVector >> 1);
2606 }
2607 int32_t sum = AccumulateNeonLane(sum_32x4);
2608 for (; TFLITE_UNLIKELY(r < reduction_size); ++r) {
2609 sum += input_vector[r];
2610 }
2611 output_vector[o] = sum;
2612 input_vector += reduction_size;
2613 }
2614 }
2615
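// For each batch b and element v this computes, in effect,
//   result[b * v_size + v] +=
//       MultiplyByQuantizedMultiplier(vector[v] * batch_vector[b * v_size + v],
//                                     multiplier, shift)
// with the sum saturated to the int16 range [-32768, 32767]. batch_vector and
// result each hold n_batch contiguous chunks of v_size elements.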
2616 void NeonVectorBatchVectorCwiseProductAccumulate(
2617 const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
2618 int32_t multiplier, int shift, int16_t* result) {
2619 int32x4_t min_value_vector = vdupq_n_s32(-32768);
2620 int32x4_t max_value_vector = vdupq_n_s32(32767);
2621
2622 for (int b = 0; b < n_batch; b++) {
2623 int v = 0;
2624 for (; v <= v_size - 16; v += 16) {
2625 int32x4x4_t prod;
2626 prod.val[0] = vmull_s16(vld1_s16(vector + v), vld1_s16(batch_vector));
2627 prod.val[1] =
2628 vmull_s16(vld1_s16(vector + v + 4), vld1_s16(batch_vector + 4));
2629 prod.val[2] =
2630 vmull_s16(vld1_s16(vector + v + 8), vld1_s16(batch_vector + 8));
2631 prod.val[3] =
2632 vmull_s16(vld1_s16(vector + v + 12), vld1_s16(batch_vector + 12));
2633 batch_vector += 16;
2634
2635 prod = MultiplyByQuantizedMultiplier4Rows(prod, multiplier, shift);
2636
2637 int16x4x4_t results;
2638 results.val[0] = vld1_s16(result);
2639 results.val[1] = vld1_s16(result + 4);
2640 results.val[2] = vld1_s16(result + 8);
2641 results.val[3] = vld1_s16(result + 12);
2642
2643 prod.val[0] = vaddq_s32(prod.val[0], vmovl_s16(results.val[0]));
2644 prod.val[1] = vaddq_s32(prod.val[1], vmovl_s16(results.val[1]));
2645 prod.val[2] = vaddq_s32(prod.val[2], vmovl_s16(results.val[2]));
2646 prod.val[3] = vaddq_s32(prod.val[3], vmovl_s16(results.val[3]));
2647
2648 prod.val[0] = vmaxq_s32(prod.val[0], min_value_vector);
2649 prod.val[1] = vmaxq_s32(prod.val[1], min_value_vector);
2650 prod.val[2] = vmaxq_s32(prod.val[2], min_value_vector);
2651 prod.val[3] = vmaxq_s32(prod.val[3], min_value_vector);
2652
2653 prod.val[0] = vminq_s32(prod.val[0], max_value_vector);
2654 prod.val[1] = vminq_s32(prod.val[1], max_value_vector);
2655 prod.val[2] = vminq_s32(prod.val[2], max_value_vector);
2656 prod.val[3] = vminq_s32(prod.val[3], max_value_vector);
2657
2658 vst1_s16(result, vmovn_s32(prod.val[0]));
2659 vst1_s16(result + 4, vmovn_s32(prod.val[1]));
2660 vst1_s16(result + 8, vmovn_s32(prod.val[2]));
2661 vst1_s16(result + 12, vmovn_s32(prod.val[3]));
2662
2663 result += 16;
2664 }
2665
2666 for (; TFLITE_UNLIKELY(v < v_size); v++) {
2667 int32_t prod = vector[v] * *batch_vector++;
2668 prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
2669 int32_t output = prod + *result;
2670 output = std::max(std::min(32767, output), -32768);
2671 *result++ = output;
2672 }
2673 }
2674 }
2675
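// Layer-norm style transform, applied independently to each of the n_batch
// rows of v_size elements:
//   output[i] = (input[i] - mean) / sqrt(variance + 1e-8)
// where mean and variance are computed over the row and the 1e-8 term guards
// against division by zero for constant inputs.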
2676 void NeonMeanStddevNormalization(const float* __restrict__ input_vector,
2677 float* __restrict__ output_vector, int v_size,
2678 int n_batch) {
2679 constexpr int kBlockSize = kFloatValuesPerNeonVector * 4;
2680
2681 for (int batch = 0; batch < n_batch; ++batch) {
2682 // Calculate sum
2683 float32x4_t sum_f32x4_0 = vdupq_n_f32(0.0f);
2684 float32x4_t sum_f32x4_1 = vdupq_n_f32(0.0f);
2685 float32x4_t sum_f32x4_2 = vdupq_n_f32(0.0f);
2686 float32x4_t sum_f32x4_3 = vdupq_n_f32(0.0f);
2687 int i = 0;
2688 for (; i <= v_size - kBlockSize; i += kBlockSize) {
2689 const float32x4_t input_f32x4_0 =
2690 vld1q_f32(input_vector + i + 0 * kFloatValuesPerNeonVector);
2691 const float32x4_t input_f32x4_1 =
2692 vld1q_f32(input_vector + i + 1 * kFloatValuesPerNeonVector);
2693 const float32x4_t input_f32x4_2 =
2694 vld1q_f32(input_vector + i + 2 * kFloatValuesPerNeonVector);
2695 const float32x4_t input_f32x4_3 =
2696 vld1q_f32(input_vector + i + 3 * kFloatValuesPerNeonVector);
2697 sum_f32x4_0 = vaddq_f32(sum_f32x4_0, input_f32x4_0);
2698 sum_f32x4_1 = vaddq_f32(sum_f32x4_1, input_f32x4_1);
2699 sum_f32x4_2 = vaddq_f32(sum_f32x4_2, input_f32x4_2);
2700 sum_f32x4_3 = vaddq_f32(sum_f32x4_3, input_f32x4_3);
2701 }
2702 sum_f32x4_0 = vaddq_f32(sum_f32x4_0, sum_f32x4_2);
2703 sum_f32x4_1 = vaddq_f32(sum_f32x4_1, sum_f32x4_3);
2704 sum_f32x4_0 = vaddq_f32(sum_f32x4_0, sum_f32x4_1);
2705 float sum = AccumulateNeonLane(sum_f32x4_0);
2706 for (; TFLITE_UNLIKELY(i < v_size); ++i) {
2707 sum += input_vector[i];
2708 }
2709 // Calculate mean
2710 const float mean = sum / v_size;
2711 const float32x4_t mean_f32x4 = vdupq_n_f32(mean);
2712 // Calculate sum of squared differences
2713 float32x4_t sum_diff_sq_f32x4_0 = vdupq_n_f32(0.0f);
2714 float32x4_t sum_diff_sq_f32x4_1 = vdupq_n_f32(0.0f);
2715 float32x4_t sum_diff_sq_f32x4_2 = vdupq_n_f32(0.0f);
2716 float32x4_t sum_diff_sq_f32x4_3 = vdupq_n_f32(0.0f);
2717 i = 0;
2718 for (; i <= v_size - kBlockSize; i += kBlockSize) {
2719 const float32x4_t input_f32x4_0 =
2720 vld1q_f32(input_vector + i + 0 * kFloatValuesPerNeonVector);
2721 const float32x4_t input_f32x4_1 =
2722 vld1q_f32(input_vector + i + 1 * kFloatValuesPerNeonVector);
2723 const float32x4_t input_f32x4_2 =
2724 vld1q_f32(input_vector + i + 2 * kFloatValuesPerNeonVector);
2725 const float32x4_t input_f32x4_3 =
2726 vld1q_f32(input_vector + i + 3 * kFloatValuesPerNeonVector);
2727 const float32x4_t diff_f32x4_0 = vsubq_f32(input_f32x4_0, mean_f32x4);
2728 const float32x4_t diff_f32x4_1 = vsubq_f32(input_f32x4_1, mean_f32x4);
2729 const float32x4_t diff_f32x4_2 = vsubq_f32(input_f32x4_2, mean_f32x4);
2730 const float32x4_t diff_f32x4_3 = vsubq_f32(input_f32x4_3, mean_f32x4);
2731 sum_diff_sq_f32x4_0 =
2732 vmlaq_f32(sum_diff_sq_f32x4_0, diff_f32x4_0, diff_f32x4_0);
2733 sum_diff_sq_f32x4_1 =
2734 vmlaq_f32(sum_diff_sq_f32x4_1, diff_f32x4_1, diff_f32x4_1);
2735 sum_diff_sq_f32x4_2 =
2736 vmlaq_f32(sum_diff_sq_f32x4_2, diff_f32x4_2, diff_f32x4_2);
2737 sum_diff_sq_f32x4_3 =
2738 vmlaq_f32(sum_diff_sq_f32x4_3, diff_f32x4_3, diff_f32x4_3);
2739 }
2740 sum_diff_sq_f32x4_0 = vaddq_f32(sum_diff_sq_f32x4_0, sum_diff_sq_f32x4_2);
2741 sum_diff_sq_f32x4_1 = vaddq_f32(sum_diff_sq_f32x4_1, sum_diff_sq_f32x4_3);
2742 sum_diff_sq_f32x4_0 = vaddq_f32(sum_diff_sq_f32x4_0, sum_diff_sq_f32x4_1);
2743 float sum_diff_sq = AccumulateNeonLane(sum_diff_sq_f32x4_0);
2744 for (; TFLITE_UNLIKELY(i < v_size); ++i) {
2745 const float diff = input_vector[i] - mean;
2746 sum_diff_sq += diff * diff;
2747 }
2748 // Calculate 1/stddev
2749 const float variance = sum_diff_sq / v_size;
2750 constexpr float kNormalizationConstant = 1e-8f;
2751 const float stddev_inv =
2752 1.0f / std::sqrt(variance + kNormalizationConstant);
2753 // Do the normalization
2754 i = 0;
2755 for (; i <= v_size - kBlockSize; i += kBlockSize) {
2756 const float32x4_t input_f32x4_0 =
2757 vld1q_f32(input_vector + i + 0 * kFloatValuesPerNeonVector);
2758 const float32x4_t input_f32x4_1 =
2759 vld1q_f32(input_vector + i + 1 * kFloatValuesPerNeonVector);
2760 const float32x4_t input_f32x4_2 =
2761 vld1q_f32(input_vector + i + 2 * kFloatValuesPerNeonVector);
2762 const float32x4_t input_f32x4_3 =
2763 vld1q_f32(input_vector + i + 3 * kFloatValuesPerNeonVector);
2764 const float32x4_t tmp_0 = vsubq_f32(input_f32x4_0, mean_f32x4);
2765 const float32x4_t tmp_1 = vsubq_f32(input_f32x4_1, mean_f32x4);
2766 const float32x4_t tmp_2 = vsubq_f32(input_f32x4_2, mean_f32x4);
2767 const float32x4_t tmp_3 = vsubq_f32(input_f32x4_3, mean_f32x4);
2768 const float32x4_t output_f32x4_0 = vmulq_n_f32(tmp_0, stddev_inv);
2769 const float32x4_t output_f32x4_1 = vmulq_n_f32(tmp_1, stddev_inv);
2770 const float32x4_t output_f32x4_2 = vmulq_n_f32(tmp_2, stddev_inv);
2771 const float32x4_t output_f32x4_3 = vmulq_n_f32(tmp_3, stddev_inv);
2772 vst1q_f32(output_vector + i + 0 * kFloatValuesPerNeonVector,
2773 output_f32x4_0);
2774 vst1q_f32(output_vector + i + 1 * kFloatValuesPerNeonVector,
2775 output_f32x4_1);
2776 vst1q_f32(output_vector + i + 2 * kFloatValuesPerNeonVector,
2777 output_f32x4_2);
2778 vst1q_f32(output_vector + i + 3 * kFloatValuesPerNeonVector,
2779 output_f32x4_3);
2780 }
2781 for (; TFLITE_UNLIKELY(i < v_size); ++i) {
2782 output_vector[i] = (input_vector[i] - mean) * stddev_inv;
2783 }
2784 // Advance to next batch
2785 input_vector += v_size;
2786 output_vector += v_size;
2787 }
2788 }
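// Scalar reference for the routine above, useful as a test oracle. This is a
// sketch for illustration only (the name MeanStddevNormalizationRef is
// hypothetical, not part of this file):
//
//   void MeanStddevNormalizationRef(const float* in, float* out, int v_size,
//                                   int n_batch) {
//     for (int b = 0; b < n_batch; ++b, in += v_size, out += v_size) {
//       float sum = 0.0f;
//       for (int i = 0; i < v_size; ++i) sum += in[i];
//       const float mean = sum / v_size;
//       float sum_diff_sq = 0.0f;
//       for (int i = 0; i < v_size; ++i) {
//         const float diff = in[i] - mean;
//         sum_diff_sq += diff * diff;
//       }
//       const float stddev_inv =
//           1.0f / std::sqrt(sum_diff_sq / v_size + 1e-8f);
//       for (int i = 0; i < v_size; ++i) out[i] = (in[i] - mean) * stddev_inv;
//     }
//   }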
2789
2790 } // namespace tensor_utils
2791 } // namespace tflite
2792
2793 #endif // USE_NEON
2794