1 /*
2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <arm_neon.h>
13
14 #include "config/aom_config.h"
15 #include "config/av1_rtcd.h"
16
17 #include "aom_dsp/aom_dsp_common.h"
18 #include "aom_dsp/arm/mem_neon.h"
19 #include "aom_ports/mem.h"
20 #include "av1/common/arm/convolve_neon.h"
21 #include "av1/common/arm/convolve_neon_i8mm.h"
22 #include "av1/common/convolve.h"
23 #include "av1/common/filter.h"
24
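// Implementation note: the kernels in this file use the Arm 8-bit integer
// matrix multiply (I8MM) extension. vusdotq_lane_s32 (USDOT) accumulates
// 4-element unsigned-by-signed dot products, and vusmmlaq_s32 (USMMLA)
// multiplies a 2x8 matrix of unsigned samples by an 8x2 matrix of signed
// filter values, accumulating a 2x2 block of 32-bit sums.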
25 DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
26 // Shift left and insert new last column in transposed 4x4 block.
27 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
28 // Shift left and insert two new columns in transposed 4x4 block.
29 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
30 // Shift left and insert three new columns in transposed 4x4 block.
31 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
32 };
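// Note: this table is used with vqtbl2q_u8 on a two-register table of the
// form { previous transposed block, newly transposed block }. Indices 0-15
// select bytes from the first register and indices 16-31 from the second, so
// each row above shifts the old transposed 4x4 block left by one, two or
// three columns and fills the vacated columns from the newly loaded rows.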
33
static inline int16x4_t convolve12_4_x(uint8x16_t samples[2],
                                       const int8x16_t filter[2],
                                       const uint8x16_t permute_tbl,
                                       const int32x4_t horiz_const) {
38 // Permute samples ready for matrix multiply.
39 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 }
40 // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 }
41 uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples[0], permute_tbl),
42 vqtbl1q_u8(samples[1], permute_tbl) };
43
44 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
45 // (filter), destructively accumulating into the destination register.
46 int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]);
47 sum = vusmmlaq_s32(sum, perm_samples[1], filter[1]);
48
49 return vqrshrn_n_s32(sum, FILTER_BITS);
50 }
51
static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2],
                                       const int8x16_t filter[2],
                                       const uint8x16x2_t permute_tbl,
                                       const int32x4_t horiz_const) {
56 // Permute samples ready for matrix multiply.
57 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 }
58 // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 }
59 // { 6, 7, 8, 9, 10, 11, 12, 13, 8, 9, 10, 11, 12, 13, 14, 15 }
60 // { 10, 11, 12, 13, 14, 15, 16, 17, 12, 13, 14, 15, 16, 17, 18, 19 }
61 uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]),
62 vqtbl1q_u8(samples[0], permute_tbl.val[1]),
63 vqtbl1q_u8(samples[1], permute_tbl.val[0]),
64 vqtbl1q_u8(samples[1], permute_tbl.val[1]) };
65
66 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
67 // (filter), destructively accumulating into the destination register.
68 int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]);
69 int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter[0]);
70 sum0123 = vusmmlaq_s32(sum0123, perm_samples[2], filter[1]);
71 sum4567 = vusmmlaq_s32(sum4567, perm_samples[3], filter[1]);
72
73 // Narrow and re-pack.
74 int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS),
75 vqrshrn_n_s32(sum4567, FILTER_BITS));
76 return vqmovun_s16(sum_s16);
77 }
78
static inline void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src,
                                                 int src_stride, uint8_t *dst,
                                                 int dst_stride, int w, int h,
                                                 const int16_t *x_filter_ptr) {
83 // The no-op filter should never be used here.
84 assert(x_filter_ptr[5] != 128);
85
86 // Split 12-tap filter into two 6-tap filters, masking the top two elements.
87 // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }
88 const int8x8_t mask = vcreate_s8(0x0000ffffffffffff);
89 const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(x_filter_ptr)), mask);
90 const int8x8_t filter_1 =
91 vext_s8(vmovn_s16(vld1q_s16(x_filter_ptr + 4)), vdup_n_s8(0), 2);
92
93 // Stagger each 6-tap filter to enable use of matrix multiply instructions.
94 // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 }
95 const int8x16_t filter[2] = {
96 vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)),
97 vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7))
98 };
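  // With the filter rows staggered by one sample, each vusmmlaq_s32 call in
  // the convolve12_*_x kernels above forms a 2x2 matrix of dot products:
  // sample row 0 (pixels n..n+7) against filter rows 0 and 1 gives outputs n
  // and n+1, while sample row 1 (pixels n+2..n+9) gives outputs n+2 and n+3.
  // Two such calls - one per 6-tap half of the 12-tap filter - complete four
  // adjacent output pixels.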
99
  // A shim of 1 << (ROUND0_BITS - 1) simplifies the computation in the
  // convolution kernels: it lets us use a single rounding right shift by
  // FILTER_BITS instead of two rounding right shifts - first by ROUND0_BITS,
  // and then by FILTER_BITS - ROUND0_BITS.
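  // In scalar terms the 12-tap path therefore computes, per output pixel
  // (illustrative sketch only):
  //   sum    = (1 << (ROUND0_BITS - 1)) + sum_{k=0..11} src[x + k] * filter[k]
  //   dst[x] = clamp_to_u8((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS)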
104 const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
105
106 if (w <= 4) {
107 const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl);
108
109 do {
110 uint8x16_t s0[2], s1[2], s2[2], s3[2];
111 load_u8_16x4(src, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
112 load_u8_16x4(src + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
113
114 int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const);
115 int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const);
116 int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const);
117 int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const);
118
119 uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
120 uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
121
122 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
123 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
124
125 dst += 4 * dst_stride;
126 src += 4 * src_stride;
127 h -= 4;
128 } while (h != 0);
129 } else {
130 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);
131
132 do {
133 const uint8_t *s = src;
134 uint8_t *d = dst;
135 int width = w;
136
137 do {
138 uint8x16_t s0[2], s1[2], s2[2], s3[2];
139 load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
140 load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
141
142 uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const);
143 uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const);
144 uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const);
145 uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const);
146
147 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
148
149 s += 8;
150 d += 8;
151 width -= 8;
152 } while (width != 0);
153 src += 4 * src_stride;
154 dst += 4 * dst_stride;
155 h -= 4;
156 } while (h != 0);
157 }
158 }
159
static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
                                      const uint8x16x3_t permute_tbl,
                                      const int32x4_t horiz_const) {
163 // Permute samples ready for dot product.
164 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
165 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
166 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
167 uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
168 vqtbl1q_u8(samples, permute_tbl.val[1]),
169 vqtbl1q_u8(samples, permute_tbl.val[2]) };
170
171 int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filter, 0);
172 sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filter, 1);
173
174 int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[1], filter, 0);
175 sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filter, 1);
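  // Each vusdotq_lane_s32 call accumulates one 4-tap half of the 8-tap filter
  // (selected by the lane index) into four adjacent output pixels, so two
  // calls complete the filter for each group of four outputs.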
176
177 int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
178 // We halved the convolution filter values so - 1 from the right shift.
179 return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
180 }
181
static inline void convolve_x_sr_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x,
    const int32x4_t horiz_const) {
186 // Filter values are even, so halve to reduce intermediate precision reqs.
187 const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1);
188 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
189
190 do {
191 const uint8_t *s = src;
192 uint8_t *d = dst;
193 int w = width;
194
195 do {
196 uint8x16_t s0, s1, s2, s3;
197 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
198
199 uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const);
200 uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const);
201 uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const);
202 uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const);
203
204 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
205
206 s += 8;
207 d += 8;
208 w -= 8;
209 } while (w != 0);
210 src += 4 * src_stride;
211 dst += 4 * dst_stride;
212 height -= 4;
213 } while (height != 0);
214 }
215
static inline int16x4_t convolve6_4_x(uint8x16_t samples,
                                      const int8x16_t filter,
                                      const uint8x16_t permute_tbl,
                                      const int32x4_t horiz_const) {
220 // Permute samples ready for matrix multiply.
221 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 }
222 uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);
223
224 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
225 // (filter), destructively accumulating into the destination register.
226 int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples, filter);
227
228 // Further narrowing and packing is performed by the caller.
229 return vmovn_s32(sum);
230 }
231
static inline uint8x8_t convolve6_8_x(uint8x16_t samples,
                                      const int8x16_t filter,
                                      const uint8x16x2_t permute_tbl,
                                      const int32x4_t horiz_const) {
236 // Permute samples ready for matrix multiply.
237 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 }
238 // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 }
239 uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
240 vqtbl1q_u8(samples, permute_tbl.val[1]) };
241
242 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
243 // (filter), destructively accumulating into the destination register.
244 int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter);
245 int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter);
246
247 int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
248 // We halved the convolution filter values so - 1 from the right shift.
249 return vqrshrun_n_s16(sum, FILTER_BITS - 1);
250 }
251
static inline void convolve_x_sr_6tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x,
    const int32x4_t horiz_const) {
256 // Filter values are even, so halve to reduce intermediate precision reqs.
257 const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(filter_x), 1);
258 // Stagger the filter for use with the matrix multiply instructions.
259 // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 }
260 const int8x16_t x_filter =
261 vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8);
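  // As in convolve_x_sr_12tap_neon_i8mm above, the two 8-byte filter rows are
  // offset by one sample so that each vusmmlaq_s32 call produces four
  // adjacent output pixels.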
262
263 if (width == 4) {
264 const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl);
265 do {
266 uint8x16_t s0, s1, s2, s3;
267 load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
268
269 int16x4_t t0 = convolve6_4_x(s0, x_filter, permute_tbl, horiz_const);
270 int16x4_t t1 = convolve6_4_x(s1, x_filter, permute_tbl, horiz_const);
271 int16x4_t t2 = convolve6_4_x(s2, x_filter, permute_tbl, horiz_const);
272 int16x4_t t3 = convolve6_4_x(s3, x_filter, permute_tbl, horiz_const);
273 // We halved the filter values so -1 from right shift.
274 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
275 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
276
277 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
278 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
279
280 src += 4 * src_stride;
281 dst += 4 * dst_stride;
282 height -= 4;
283 } while (height != 0);
284 } else {
285 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);
286 do {
287 const uint8_t *s = src;
288 uint8_t *d = dst;
289 int w = width;
290
291 do {
292 uint8x16_t s0, s1, s2, s3;
293 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
294
295 uint8x8_t d0 = convolve6_8_x(s0, x_filter, permute_tbl, horiz_const);
296 uint8x8_t d1 = convolve6_8_x(s1, x_filter, permute_tbl, horiz_const);
297 uint8x8_t d2 = convolve6_8_x(s2, x_filter, permute_tbl, horiz_const);
298 uint8x8_t d3 = convolve6_8_x(s3, x_filter, permute_tbl, horiz_const);
299
300 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
301
302 s += 8;
303 d += 8;
304 w -= 8;
305 } while (w != 0);
306 src += 4 * src_stride;
307 dst += 4 * dst_stride;
308 height -= 4;
309 } while (height != 0);
310 }
311 }
312
void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const int subpel_x_qn,
                                 ConvolveParams *conv_params) {
318 if (w == 2 || h == 2) {
319 av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
320 subpel_x_qn, conv_params);
321 return;
322 }
323
324 const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
325 src -= horiz_offset;
326
327 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
328 filter_params_x, subpel_x_qn & SUBPEL_MASK);
329
330 int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);
331
  // A shim of 1 << (ROUND0_BITS - 1) simplifies the computation in the
  // convolution kernels: it lets us use a single rounding right shift by
  // FILTER_BITS instead of two rounding right shifts - first by ROUND0_BITS,
  // and then by FILTER_BITS - ROUND0_BITS.
  // Halve the total because we will halve the filter values.
  const int32x4_t horiz_const = vdupq_n_s32((1 << (ROUND0_BITS - 1)) / 2);
338
339 if (filter_taps <= 6) {
340 convolve_x_sr_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride, w, h,
341 x_filter_ptr, horiz_const);
342 return;
343 }
344
345 if (filter_taps > 8) {
346 convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
347 x_filter_ptr);
348 return;
349 }
350
351 convolve_x_sr_8tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
352 x_filter_ptr, horiz_const);
353 }
354
static inline void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
                                        uint8x8_t a2, uint8x8_t a3,
                                        uint8x16_t *b) {
358 // Transpose 8-bit elements and concatenate result rows as follows:
359 // a0: 00, 01, 02, 03, XX, XX, XX, XX
360 // a1: 10, 11, 12, 13, XX, XX, XX, XX
361 // a2: 20, 21, 22, 23, XX, XX, XX, XX
362 // a3: 30, 31, 32, 33, XX, XX, XX, XX
363 //
364 // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
365
366 uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
367 uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
368 uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
369 uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
370
371 uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
372 uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
373
374 uint16x8_t a0123 =
375 vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0];
376
377 *b = vreinterpretq_u8_u16(a0123);
378 }
379
static inline void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
                                        uint8x8_t a2, uint8x8_t a3,
                                        uint8x16_t *b0, uint8x16_t *b1) {
383 // Transpose 8-bit elements and concatenate result rows as follows:
384 // a0: 00, 01, 02, 03, 04, 05, 06, 07
385 // a1: 10, 11, 12, 13, 14, 15, 16, 17
386 // a2: 20, 21, 22, 23, 24, 25, 26, 27
387 // a3: 30, 31, 32, 33, 34, 35, 36, 37
388 //
389 // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
390 // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
391
392 uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
393 uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
394 uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
395 uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
396
397 uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
398 uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
399
400 uint16x8x2_t a0123 =
401 vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23));
402
403 *b0 = vreinterpretq_u8_u16(a0123.val[0]);
404 *b1 = vreinterpretq_u8_u16(a0123.val[1]);
405 }
406
static inline int16x4_t convolve12_4_y(const uint8x16_t s0, const uint8x16_t s1,
                                       const uint8x16_t s2,
                                       const int8x8_t filters_0_7,
                                       const int8x8_t filters_4_11) {
411 int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters_0_7, 0);
412 sum = vusdotq_lane_s32(sum, s1, filters_0_7, 1);
413 sum = vusdotq_lane_s32(sum, s2, filters_4_11, 1);
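  // Each dot product above accumulates four consecutive rows of every output
  // column: taps 0-3 come from s0, taps 4-7 from s1 (lane 1 of filters_0_7)
  // and taps 8-11 from s2 (lane 1 of filters_4_11).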
414
415 // Further narrowing and packing is performed by the caller.
416 return vqmovn_s32(sum);
417 }
418
static inline uint8x8_t convolve12_8_y(
    const uint8x16_t s0_lo, const uint8x16_t s0_hi, const uint8x16_t s1_lo,
    const uint8x16_t s1_hi, const uint8x16_t s2_lo, const uint8x16_t s2_hi,
    const int8x8_t filters_0_7, const int8x8_t filters_4_11) {
423 int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters_0_7, 0);
424 sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1);
425 sum0123 = vusdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1);
426
427 int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters_0_7, 0);
428 sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1);
429 sum4567 = vusdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1);
430
431 // Narrow and re-pack.
432 int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567));
433 return vqrshrun_n_s16(sum, FILTER_BITS);
434 }
435
static inline void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16_t *y_filter_ptr) {
441 // The no-op filter should never be used here.
442 assert(y_filter_ptr[5] != 128);
443
444 const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr));
445 const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4));
446
447 const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
448
449 if (w == 4) {
450 uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
451 load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
452 &s8, &s9, &sA);
453 src_ptr += 11 * src_stride;
454
455 // This operation combines a conventional transpose and the sample permute
456 // (see horizontal case) required before computing the dot product.
457 uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A;
458 transpose_concat_4x4(s0, s1, s2, s3, &s0123);
459 transpose_concat_4x4(s1, s2, s3, s4, &s1234);
460 transpose_concat_4x4(s2, s3, s4, s5, &s2345);
461 transpose_concat_4x4(s3, s4, s5, s6, &s3456);
462 transpose_concat_4x4(s4, s5, s6, s7, &s4567);
463 transpose_concat_4x4(s5, s6, s7, s8, &s5678);
464 transpose_concat_4x4(s6, s7, s8, s9, &s6789);
465 transpose_concat_4x4(s7, s8, s9, sA, &s789A);
466
467 do {
468 uint8x8_t sB, sC, sD, sE;
469 load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE);
470
471 uint8x16_t s89AB, s9ABC, sABCD, sBCDE;
472 transpose_concat_4x4(sB, sC, sD, sE, &sBCDE);
473
474 // Merge new data into block from previous iteration.
475 uint8x16x2_t samples_LUT = { { s789A, sBCDE } };
476 s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
477 s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
478 sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
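      // For example, s89AB is s789A shifted up one row with new row B
      // appended; re-using the previous transposed block this way avoids
      // redoing the full transpose for rows that are already loaded.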
479
480 int16x4_t d0 =
481 convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11);
482 int16x4_t d1 =
483 convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11);
484 int16x4_t d2 =
485 convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11);
486 int16x4_t d3 =
487 convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11);
488 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
489 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
490
491 store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
492 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
493
494 // Prepare block for next iteration - re-using as much as possible.
495 // Shuffle everything up four rows.
496 s0123 = s4567;
497 s1234 = s5678;
498 s2345 = s6789;
499 s3456 = s789A;
500 s4567 = s89AB;
501 s5678 = s9ABC;
502 s6789 = sABCD;
503 s789A = sBCDE;
504
505 src_ptr += 4 * src_stride;
506 dst_ptr += 4 * dst_stride;
507 h -= 4;
508 } while (h != 0);
509 } else {
510 do {
511 int height = h;
512 const uint8_t *s = src_ptr;
513 uint8_t *d = dst_ptr;
514
515 uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
516 load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
517 &s9, &sA);
518 s += 11 * src_stride;
519
520 // This operation combines a conventional transpose and the sample
521 // permute (see horizontal case) required before computing the dot
522 // product.
523 uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
524 s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
525 s6789_hi, s789A_lo, s789A_hi;
526 transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
527 transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
528 transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
529 transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
530 transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
531 transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
532 transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
533 transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
534
535 do {
536 uint8x8_t sB, sC, sD, sE;
537 load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE);
538
539 uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
540 sBCDE_lo, sBCDE_hi;
541 transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
542
543 // Merge new data into block from previous iteration.
544 uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
545 s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]);
546 s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]);
547 sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]);
548
549 uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } };
550 s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]);
551 s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]);
552 sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]);
553
554 uint8x8_t d0 =
555 convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo,
556 s89AB_hi, filter_0_7, filter_4_11);
557 uint8x8_t d1 =
558 convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo,
559 s9ABC_hi, filter_0_7, filter_4_11);
560 uint8x8_t d2 =
561 convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo,
562 sABCD_hi, filter_0_7, filter_4_11);
563 uint8x8_t d3 =
564 convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo,
565 sBCDE_hi, filter_0_7, filter_4_11);
566
567 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
568
569 // Prepare block for next iteration - re-using as much as possible.
570 // Shuffle everything up four rows.
571 s0123_lo = s4567_lo;
572 s0123_hi = s4567_hi;
573 s1234_lo = s5678_lo;
574 s1234_hi = s5678_hi;
575 s2345_lo = s6789_lo;
576 s2345_hi = s6789_hi;
577 s3456_lo = s789A_lo;
578 s3456_hi = s789A_hi;
579 s4567_lo = s89AB_lo;
580 s4567_hi = s89AB_hi;
581 s5678_lo = s9ABC_lo;
582 s5678_hi = s9ABC_hi;
583 s6789_lo = sABCD_lo;
584 s6789_hi = sABCD_hi;
585 s789A_lo = sBCDE_lo;
586 s789A_hi = sBCDE_hi;
587
588 s += 4 * src_stride;
589 d += 4 * dst_stride;
590 height -= 4;
591 } while (height != 0);
592 src_ptr += 8;
593 dst_ptr += 8;
594 w -= 8;
595 } while (w != 0);
596 }
597 }
598
static inline int16x4_t convolve8_4_y(const uint8x16_t s0, const uint8x16_t s1,
                                      const int8x8_t filters) {
601 int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0);
602 sum = vusdotq_lane_s32(sum, s1, filters, 1);
603
604 // Further narrowing and packing is performed by the caller.
605 return vqmovn_s32(sum);
606 }
607
static inline uint8x8_t convolve8_8_y(const uint8x16_t s0_lo,
                                      const uint8x16_t s0_hi,
                                      const uint8x16_t s1_lo,
                                      const uint8x16_t s1_hi,
                                      const int8x8_t filters) {
613 int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters, 0);
614 sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters, 1);
615
616 int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters, 0);
617 sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters, 1);
618
619 // Narrow and re-pack.
620 int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567));
621 return vqrshrun_n_s16(sum, FILTER_BITS);
622 }
623
static inline void convolve_y_sr_8tap_neon_i8mm(const uint8_t *src_ptr,
                                                int src_stride,
                                                uint8_t *dst_ptr,
                                                int dst_stride, int w, int h,
                                                const int16_t *y_filter_ptr) {
629 const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr));
630
631 const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
632
633 if (w == 4) {
634 uint8x8_t s0, s1, s2, s3, s4, s5, s6;
635 load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
636 src_ptr += 7 * src_stride;
637
638 // This operation combines a conventional transpose and the sample permute
639 // (see horizontal case) required before computing the dot product.
640 uint8x16_t s0123, s1234, s2345, s3456;
641 transpose_concat_4x4(s0, s1, s2, s3, &s0123);
642 transpose_concat_4x4(s1, s2, s3, s4, &s1234);
643 transpose_concat_4x4(s2, s3, s4, s5, &s2345);
644 transpose_concat_4x4(s3, s4, s5, s6, &s3456);
645
646 do {
647 uint8x8_t s7, s8, s9, s10;
648 load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
649
650 uint8x16_t s4567, s5678, s6789, s78910;
651 transpose_concat_4x4(s7, s8, s9, s10, &s78910);
652
653 // Merge new data into block from previous iteration.
654 uint8x16x2_t samples_LUT = { { s3456, s78910 } };
655 s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
656 s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
657 s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
658
659 int16x4_t d0 = convolve8_4_y(s0123, s4567, filter);
660 int16x4_t d1 = convolve8_4_y(s1234, s5678, filter);
661 int16x4_t d2 = convolve8_4_y(s2345, s6789, filter);
662 int16x4_t d3 = convolve8_4_y(s3456, s78910, filter);
663 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
664 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
665
666 store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
667 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
668
669 // Prepare block for next iteration - re-using as much as possible.
670 // Shuffle everything up four rows.
671 s0123 = s4567;
672 s1234 = s5678;
673 s2345 = s6789;
674 s3456 = s78910;
675
676 src_ptr += 4 * src_stride;
677 dst_ptr += 4 * dst_stride;
678 h -= 4;
679 } while (h != 0);
680 } else {
681 do {
682 int height = h;
683 const uint8_t *s = src_ptr;
684 uint8_t *d = dst_ptr;
685
686 uint8x8_t s0, s1, s2, s3, s4, s5, s6;
687 load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
688 s += 7 * src_stride;
689
690 // This operation combines a conventional transpose and the sample
691 // permute (see horizontal case) required before computing the dot
692 // product.
693 uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
694 s3456_lo, s3456_hi;
695 transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
696 transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
697 transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
698 transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
699
700 do {
701 uint8x8_t s7, s8, s9, s10;
702 load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
703
704 uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
705 s78910_lo, s78910_hi;
706 transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
707
708 // Merge new data into block from previous iteration.
709 uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } };
710 s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]);
711 s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]);
712 s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]);
713
714 uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } };
715 s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]);
716 s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]);
717 s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]);
718
719 uint8x8_t d0 =
720 convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter);
721 uint8x8_t d1 =
722 convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter);
723 uint8x8_t d2 =
724 convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter);
725 uint8x8_t d3 =
726 convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter);
727
728 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
729
730 // Prepare block for next iteration - re-using as much as possible.
731 // Shuffle everything up four rows.
732 s0123_lo = s4567_lo;
733 s0123_hi = s4567_hi;
734 s1234_lo = s5678_lo;
735 s1234_hi = s5678_hi;
736 s2345_lo = s6789_lo;
737 s2345_hi = s6789_hi;
738 s3456_lo = s78910_lo;
739 s3456_hi = s78910_hi;
740
741 s += 4 * src_stride;
742 d += 4 * dst_stride;
743 height -= 4;
744 } while (height != 0);
745 src_ptr += 8;
746 dst_ptr += 8;
747 w -= 8;
748 } while (w != 0);
749 }
750 }
751
void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_y_qn) {
756 if (w == 2 || h == 2) {
757 av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
758 subpel_y_qn);
759 return;
760 }
761
762 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
763
764 if (y_filter_taps <= 6) {
765 av1_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h,
766 filter_params_y, subpel_y_qn);
767 return;
768 }
769
770 const int vert_offset = y_filter_taps / 2 - 1;
771 src -= vert_offset * src_stride;
772
773 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
774 filter_params_y, subpel_y_qn & SUBPEL_MASK);
775
776 if (y_filter_taps > 8) {
777 convolve_y_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
778 y_filter_ptr);
779 return;
780 }
781 convolve_y_sr_8tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
782 y_filter_ptr);
783 }
784
static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples,
                                         const int8x8_t filters,
                                         const uint8x16x3_t permute_tbl,
                                         const int32x4_t horiz_const) {
789 // Permute samples ready for dot product.
790 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
791 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
792 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
793 uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
794 vqtbl1q_u8(samples, permute_tbl.val[1]),
795 vqtbl1q_u8(samples, permute_tbl.val[2]) };
796
797 int32x4_t sum0123 =
798 vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0);
799 sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filters, 1);
800
801 int32x4_t sum4567 =
802 vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0);
803 sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filters, 1);
804
805 // Narrow and re-pack.
806 // We halved the convolution filter values so -1 from the right shift.
807 return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
808 vshrn_n_s32(sum4567, ROUND0_BITS - 1));
809 }
810
static inline void convolve_2d_sr_horiz_8tap_neon_i8mm(
    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
    int im_h, const int16_t *x_filter_ptr) {
814 // Filter values are even, so halve to reduce intermediate precision reqs.
815 const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
816
817 const int bd = 8;
818 // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
819 // shifts - which are generally faster than rounding shifts on modern CPUs.
820 // The outermost -1 is needed because we halved the filter values.
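  // The 1 << (bd + FILTER_BITS - 2) term is (half of) the offset added to the
  // intermediate block so that its values stay non-negative for the vertical
  // pass, which removes the resulting offset again after filtering.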
821 const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
822 (1 << ((ROUND0_BITS - 1) - 1)));
823
824 const uint8_t *src_ptr = src;
825 int16_t *dst_ptr = im_block;
826 int dst_stride = im_stride;
827 int height = im_h;
828
829 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
830 do {
831 const uint8_t *s = src_ptr;
832 int16_t *d = dst_ptr;
833 int width = w;
834
835 do {
836 uint8x16_t s0, s1, s2, s3;
837 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
838
839 int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
840 int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
841 int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
842 int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
843
844 store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
845
846 s += 8;
847 d += 8;
848 width -= 8;
849 } while (width != 0);
850 src_ptr += 4 * src_stride;
851 dst_ptr += 4 * dst_stride;
852 height -= 4;
853 } while (height > 4);
854
855 do {
856 const uint8_t *s = src_ptr;
857 int16_t *d = dst_ptr;
858 int width = w;
859
860 do {
861 uint8x16_t s0 = vld1q_u8(s);
862 int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
863 vst1q_s16(d, d0);
864
865 s += 8;
866 d += 8;
867 width -= 8;
868 } while (width != 0);
869 src_ptr += src_stride;
870 dst_ptr += dst_stride;
871 } while (--height != 0);
872 }
873
static inline int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
                                         const int8x8_t filters,
                                         const uint8x16_t permute_tbl,
                                         const int32x4_t horiz_const) {
878 // Permute samples ready for dot product.
879 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
880 uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);
881
882 int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples, filters, 0);
883
884 // We halved the convolution filter values so -1 from the right shift.
885 return vshrn_n_s32(sum, ROUND0_BITS - 1);
886 }
887
static inline int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
                                         const int8x8_t filters,
                                         const uint8x16x2_t permute_tbl,
                                         const int32x4_t horiz_const) {
892 // Permute samples ready for dot product.
893 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
894 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
895 uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
896 vqtbl1q_u8(samples, permute_tbl.val[1]) };
897
898 int32x4_t sum0123 =
899 vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0);
900 int32x4_t sum4567 =
901 vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0);
902
903 // Narrow and re-pack.
904 // We halved the filter values so -1 from right shift.
905 return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
906 vshrn_n_s32(sum4567, ROUND0_BITS - 1));
907 }
908
static inline void convolve_2d_sr_horiz_4tap_neon_i8mm(
    const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int width,
    int height, const int16_t *filter_x) {
912 const int bd = 8;
913 const int16x4_t x_filter = vld1_s16(filter_x + 2);
914 // All 4-tap and bilinear filter values are even, so halve them to reduce
915 // intermediate precision requirements.
916 const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
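  // The four filter taps occupy lanes 0-3 of the 8-lane vector (lanes 4-7 are
  // zero), so the dot products below only reference lane 0 of the filter.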
917
918 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
919 // shifts - which are generally faster than rounding shifts on modern CPUs.
920 // Halve the total because we halved the filter values.
921 const int32x4_t horiz_const = vdupq_n_s32(
922 (((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2));
923
924 if (width == 4) {
925 const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl);
926 do {
927 uint8x16_t s0, s1, s2, s3;
928 load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
929
930 int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const);
931 int16x4_t d1 = convolve4_4_2d_h(s1, filter, perm_tbl, horiz_const);
932 int16x4_t d2 = convolve4_4_2d_h(s2, filter, perm_tbl, horiz_const);
933 int16x4_t d3 = convolve4_4_2d_h(s3, filter, perm_tbl, horiz_const);
934
935 store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);
936
937 src += 4 * src_stride;
938 dst += 4 * dst_stride;
939 height -= 4;
940 } while (height > 4);
941
942 do {
943 uint8x16_t s0 = vld1q_u8(src);
944 int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const);
945 vst1_s16(dst, d0);
946
947 src += src_stride;
948 dst += dst_stride;
949 } while (--height != 0);
950 } else {
951 const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
952 do {
953 int w = width;
954 const uint8_t *s = src;
955 int16_t *d = dst;
956
957 do {
958 uint8x16_t s0, s1, s2, s3;
959 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
960
961 int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const);
962 int16x8_t d1 = convolve4_8_2d_h(s1, filter, perm_tbl, horiz_const);
963 int16x8_t d2 = convolve4_8_2d_h(s2, filter, perm_tbl, horiz_const);
964 int16x8_t d3 = convolve4_8_2d_h(s3, filter, perm_tbl, horiz_const);
965
966 store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
967
968 s += 8;
969 d += 8;
970 w -= 8;
971 } while (w != 0);
972 src += 4 * src_stride;
973 dst += 4 * dst_stride;
974 height -= 4;
975 } while (height > 4);
976
977 do {
978 const uint8_t *s = src;
979 int16_t *d = dst;
980 int w = width;
981
982 do {
983 uint8x16_t s0 = vld1q_u8(s);
984 int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const);
985 vst1q_s16(d, d0);
986
987 s += 8;
988 d += 8;
989 w -= 8;
990 } while (w != 0);
991 src += src_stride;
992 dst += dst_stride;
993 } while (--height != 0);
994 }
995 }
996
static inline int16x4_t convolve6_4_2d_h(uint8x16_t samples,
                                         const int8x16_t filter,
                                         const uint8x16_t permute_tbl,
                                         const int32x4_t horiz_const) {
1001 // Permute samples ready for matrix multiply.
1002 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 }
1003 uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);
1004
1005 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
1006 // (filter), destructively accumulating into the destination register.
1007 int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples, filter);
1008
1009 // We halved the convolution filter values so -1 from the right shift.
1010 return vshrn_n_s32(sum, ROUND0_BITS - 1);
1011 }
1012
static inline int16x8_t convolve6_8_2d_h(uint8x16_t samples,
                                         const int8x16_t filter,
                                         const uint8x16x2_t permute_tbl,
                                         const int32x4_t horiz_const) {
1017 // Permute samples ready for matrix multiply.
1018 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 }
1019 // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 }
1020 uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
1021 vqtbl1q_u8(samples, permute_tbl.val[1]) };
1022
1023 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
1024 // (filter), destructively accumulating into the destination register.
1025 int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter);
1026 int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter);
1027
1028 // Narrow and re-pack.
1029 // We halved the convolution filter values so -1 from the right shift.
1030 return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
1031 vshrn_n_s32(sum4567, ROUND0_BITS - 1));
1032 }
1033
static inline void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src,
                                                 int src_stride, uint8_t *dst,
                                                 int dst_stride, int w, int h,
                                                 const int16_t *x_filter_ptr,
                                                 const int16_t *y_filter_ptr) {
1039 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
1040 // Filter values are even, so halve to reduce intermediate precision reqs.
1041 const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
1042 // Stagger the filter for use with the matrix multiply instructions.
1043 // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 }
1044 const int8x16_t x_filter =
1045 vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8);
1046
1047 const int bd = 8;
1048 // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
1049 // shifts in convolution kernels - which are generally faster than rounding
1050 // shifts on modern CPUs. The outermost -1 is needed because we halved the
1051 // filter values.
1052 const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
1053 (1 << ((ROUND0_BITS - 1) - 1)));
1054 const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));
1055 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);
1056
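  // Unlike the im_block-based horizontal/vertical split used elsewhere in
  // this file, the 6-tap case keeps the sliding window of horizontally
  // filtered rows (v_s0..v_s4 plus four new rows per iteration) entirely in
  // registers and feeds it straight into the vertical kernel, so no
  // intermediate buffer is needed.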
1057 do {
1058 const uint8_t *s = src;
1059 uint8_t *d = dst;
1060 int height = h;
1061
1062 uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4;
1063 load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4);
1064 s += 5 * src_stride;
1065
1066 int16x8_t v_s0 = convolve6_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const);
1067 int16x8_t v_s1 = convolve6_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const);
1068 int16x8_t v_s2 = convolve6_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const);
1069 int16x8_t v_s3 = convolve6_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const);
1070 int16x8_t v_s4 = convolve6_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const);
1071
1072 do {
1073 uint8x16_t h_s5, h_s6, h_s7, h_s8;
1074 load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8);
1075
1076 int16x8_t v_s5 =
1077 convolve6_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const);
1078 int16x8_t v_s6 =
1079 convolve6_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const);
1080 int16x8_t v_s7 =
1081 convolve6_8_2d_h(h_s7, x_filter, permute_tbl, horiz_const);
1082 int16x8_t v_s8 =
1083 convolve6_8_2d_h(h_s8, x_filter, permute_tbl, horiz_const);
1084
1085 uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
1086 y_filter, vert_const);
1087 uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
1088 y_filter, vert_const);
1089 uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
1090 y_filter, vert_const);
1091 uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
1092 y_filter, vert_const);
1093
1094 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
1095
1096 v_s0 = v_s4;
1097 v_s1 = v_s5;
1098 v_s2 = v_s6;
1099 v_s3 = v_s7;
1100 v_s4 = v_s8;
1101
1102 s += 4 * src_stride;
1103 d += 4 * dst_stride;
1104 height -= 4;
1105 } while (height != 0);
1106 src += 8;
1107 dst += 8;
1108 w -= 8;
1109 } while (w != 0);
1110 }
1111
static inline void convolve_2d_sr_6tap_4tap_neon_i8mm(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
1115 const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
1116 // Filter values are even, so halve to reduce intermediate precision reqs.
1117 const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
1118 // Stagger the filter for use with the matrix multiply instructions.
1119 // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 }
1120 const int8x16_t x_filter =
1121 vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8);
1122
1123 const int bd = 8;
1124 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
1125 // shifts - which are generally faster than rounding shifts on modern CPUs.
1126 // Halve the total because we halved the filter values.
1127 const int32x4_t horiz_const = vdupq_n_s32(
1128 ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2);
1129 const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));
1130
1131 if (w == 4) {
1132 const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl);
1133 uint8x16_t h_s0, h_s1, h_s2;
1134 load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
1135
1136 int16x4_t v_s0 = convolve6_4_2d_h(h_s0, x_filter, permute_tbl, horiz_const);
1137 int16x4_t v_s1 = convolve6_4_2d_h(h_s1, x_filter, permute_tbl, horiz_const);
1138 int16x4_t v_s2 = convolve6_4_2d_h(h_s2, x_filter, permute_tbl, horiz_const);
1139
1140 src += 3 * src_stride;
1141
1142 do {
1143 uint8x16_t h_s3, h_s4, h_s5, h_s6;
1144 load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
1145
1146 int16x4_t v_s3 =
1147 convolve6_4_2d_h(h_s3, x_filter, permute_tbl, horiz_const);
1148 int16x4_t v_s4 =
1149 convolve6_4_2d_h(h_s4, x_filter, permute_tbl, horiz_const);
1150 int16x4_t v_s5 =
1151 convolve6_4_2d_h(h_s5, x_filter, permute_tbl, horiz_const);
1152 int16x4_t v_s6 =
1153 convolve6_4_2d_h(h_s6, x_filter, permute_tbl, horiz_const);
1154
1155 int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter);
1156 int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter);
1157 int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter);
1158 int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter);
1159
1160 uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const));
1161 uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const));
1162
1163 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
1164 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
1165
1166 v_s0 = v_s4;
1167 v_s1 = v_s5;
1168 v_s2 = v_s6;
1169
1170 src += 4 * src_stride;
1171 dst += 4 * dst_stride;
1172 h -= 4;
1173 } while (h != 0);
1174 } else {
1175 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);
1176
1177 do {
1178 int height = h;
1179 const uint8_t *s = src;
1180 uint8_t *d = dst;
1181
1182 uint8x16_t h_s0, h_s1, h_s2;
1183 load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
1184
1185 int16x8_t v_s0 =
1186 convolve6_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const);
1187 int16x8_t v_s1 =
1188 convolve6_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const);
1189 int16x8_t v_s2 =
1190 convolve6_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const);
1191
1192 s += 3 * src_stride;
1193
1194 do {
1195 uint8x16_t h_s3, h_s4, h_s5, h_s6;
1196 load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
1197
1198 int16x8_t v_s3 =
1199 convolve6_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const);
1200 int16x8_t v_s4 =
1201 convolve6_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const);
1202 int16x8_t v_s5 =
1203 convolve6_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const);
1204 int16x8_t v_s6 =
1205 convolve6_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const);
1206
1207 uint8x8_t d0 =
1208 convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const);
1209 uint8x8_t d1 =
1210 convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const);
1211 uint8x8_t d2 =
1212 convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const);
1213 uint8x8_t d3 =
1214 convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const);
1215
1216 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
1217
1218 v_s0 = v_s4;
1219 v_s1 = v_s5;
1220 v_s2 = v_s6;
1221
1222 s += 4 * src_stride;
1223 d += 4 * dst_stride;
1224 height -= 4;
1225 } while (height != 0);
1226 src += 8;
1227 dst += 8;
1228 w -= 8;
1229 } while (w != 0);
1230 }
1231 }
1232
void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_x_qn, const int subpel_y_qn,
                                  ConvolveParams *conv_params) {
1239 if (w == 2 || h == 2) {
1240 av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
1241 filter_params_x, filter_params_y, subpel_x_qn,
1242 subpel_y_qn, conv_params);
1243 return;
1244 }
1245
1246 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
1247 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
1248 const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
1249 const int im_h = h + clamped_y_taps - 1;
1250 const int im_stride = MAX_SB_SIZE;
1251 const int vert_offset = clamped_y_taps / 2 - 1;
1252 const int horiz_offset = filter_params_x->taps / 2 - 1;
1253 const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
1254
1255 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
1256 filter_params_x, subpel_x_qn & SUBPEL_MASK);
1257 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
1258 filter_params_y, subpel_y_qn & SUBPEL_MASK);
1259
1260 if (filter_params_x->taps > 8) {
1261 DECLARE_ALIGNED(16, int16_t,
1262 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
1263
1264 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
1265 const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
1266
1267 convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block,
1268 im_stride, w, im_h, x_filter_ptr);
1269
1270 convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1271 y_filter_0_7, y_filter_8_11);
1272 } else {
1273 DECLARE_ALIGNED(16, int16_t,
1274 im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
1275
1276 if (x_filter_taps == 6 && y_filter_taps == 6) {
1277 convolve_2d_sr_6tap_neon_i8mm(src_ptr + 1, src_stride, dst, dst_stride, w,
1278 h, x_filter_ptr, y_filter_ptr);
1279 return;
1280 }
1281
    // Used for both the (6, 4) and (4, 4) horizontal/vertical filter tap
    // combinations.
1283 if (x_filter_taps <= 6 && y_filter_taps <= 4) {
1284 convolve_2d_sr_6tap_4tap_neon_i8mm(src_ptr + 1, src_stride, dst,
1285 dst_stride, w, h, x_filter_ptr,
1286 y_filter_ptr);
1287 return;
1288 }
1289
1290 if (x_filter_taps <= 4) {
1291 convolve_2d_sr_horiz_4tap_neon_i8mm(src_ptr + 2, src_stride, im_block,
1292 im_stride, w, im_h, x_filter_ptr);
1293 } else {
1294 convolve_2d_sr_horiz_8tap_neon_i8mm(src_ptr, src_stride, im_block,
1295 im_stride, w, im_h, x_filter_ptr);
1296 }
1297
1298 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
1299
1300 if (clamped_y_taps <= 4) {
1301 convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1302 y_filter_ptr);
1303 } else if (clamped_y_taps == 6) {
1304 convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1305 y_filter);
1306 } else {
1307 convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1308 y_filter);
1309 }
1310 }
1311 }
1312