1 /*
2  *
3  * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
4  *
5  * This source code is subject to the terms of the BSD 2 Clause License and
6  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7  * was not distributed with this source code in the LICENSE file, you can
8  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
9  * Media Patent License 1.0 was not distributed with this source code in the
10  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
11  */
12 
13 #include <assert.h>
14 #include <arm_neon.h>
15 
16 #include "config/aom_config.h"
17 #include "config/av1_rtcd.h"
18 
19 #include "aom_dsp/aom_dsp_common.h"
20 #include "aom_dsp/arm/mem_neon.h"
21 #include "aom_dsp/arm/transpose_neon.h"
22 #include "aom_ports/mem.h"
23 #include "av1/common/convolve.h"
24 #include "av1/common/filter.h"
25 #include "av1/common/arm/convolve_neon.h"
26 
27 static inline int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1,
28                                        const int16x4_t s2, const int16x4_t s3,
29                                        const int16x4_t s4, const int16x4_t s5,
30                                        const int16x4_t s6, const int16x4_t s7,
31                                        const int16x4_t s8, const int16x4_t s9,
32                                        const int16x4_t s10, const int16x4_t s11,
33                                        const int16x8_t x_filter_0_7,
34                                        const int16x4_t x_filter_8_11,
35                                        const int32x4_t horiz_const) {
36   const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
37   const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
38 
39   int32x4_t sum = horiz_const;
40   sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
41   sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
42   sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
43   sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
44   sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
45   sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
46   sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
47   sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
48   sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
49   sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
50   sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
51   sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);
52 
53   return vqrshrn_n_s32(sum, FILTER_BITS);
54 }
55 
56 static inline void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
57                                             int src_stride, uint8_t *dst_ptr,
58                                             const int dst_stride, int w, int h,
59                                             const int16_t *x_filter_ptr) {
60   const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
61   const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
62 
63   // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
64   // shift by FILTER_BITS - instead of a first rounding right shift by
65   // ROUND0_BITS, followed by a second rounding right shift by FILTER_BITS -
66   // ROUND0_BITS.
67   const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
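  // For example, with ROUND0_BITS == 3 and FILTER_BITS == 7 (assuming the
  // standard AV1 configuration), the two-stage rounding of a filter sum S,
  //   (((S + 4) >> 3) + 8) >> 4,
  // equals the single rounding narrowing shift used in convolve12_4_x,
  //   (S + 4 + 64) >> 7,
  // once the 4 (i.e. 1 << (ROUND0_BITS - 1)) has been folded in via
  // horiz_const.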
68 
69 #if AOM_ARCH_AARCH64
70   do {
71     const uint8_t *s = src_ptr;
72     uint8_t *d = dst_ptr;
73     int width = w;
74 
75     uint8x8_t t0, t1, t2, t3;
76     load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
77     transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
78 
79     int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
80     int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
81     int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
82     int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
83     int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
84     int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
85     int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
86     int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
87 
88     load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
89     transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
90 
91     int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
92     int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
93     int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
94 
95     s += 11;
96 
97     do {
98       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
99       transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
100 
101       int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
102       int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
103       int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
104       int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
105 
106       int16x4_t d0 =
107           convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
108                          x_filter_0_7, x_filter_8_11, horiz_const);
109       int16x4_t d1 =
110           convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
111                          x_filter_0_7, x_filter_8_11, horiz_const);
112       int16x4_t d2 =
113           convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
114                          x_filter_0_7, x_filter_8_11, horiz_const);
115       int16x4_t d3 =
116           convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
117                          x_filter_0_7, x_filter_8_11, horiz_const);
118 
119       transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
120 
121       uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
122       uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
123 
124       store_u8x4_strided_x2(d, dst_stride, d01);
125       store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23);
126 
127       s0 = s4;
128       s1 = s5;
129       s2 = s6;
130       s3 = s7;
131       s4 = s8;
132       s5 = s9;
133       s6 = s10;
134       s7 = s11;
135       s8 = s12;
136       s9 = s13;
137       s10 = s14;
138       s += 4;
139       d += 4;
140       width -= 4;
141     } while (width != 0);
142     src_ptr += 4 * src_stride;
143     dst_ptr += 4 * dst_stride;
144     h -= 4;
145   } while (h != 0);
146 
147 #else   // !AOM_ARCH_AARCH64
148   do {
149     const uint8_t *s = src_ptr;
150     uint8_t *d = dst_ptr;
151     int width = w;
152 
153     do {
154       uint8x16_t t0 = vld1q_u8(s);
155       int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
156       int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
157 
158       int16x4_t s0 = vget_low_s16(tt0);
159       int16x4_t s4 = vget_high_s16(tt0);
160       int16x4_t s8 = vget_low_s16(tt8);
161       int16x4_t s12 = vget_high_s16(tt8);
162 
163       int16x4_t s1 = vext_s16(s0, s4, 1);    //  a1  a2  a3  a4
164       int16x4_t s2 = vext_s16(s0, s4, 2);    //  a2  a3  a4  a5
165       int16x4_t s3 = vext_s16(s0, s4, 3);    //  a3  a4  a5  a6
166       int16x4_t s5 = vext_s16(s4, s8, 1);    //  a5  a6  a7  a8
167       int16x4_t s6 = vext_s16(s4, s8, 2);    //  a6  a7  a8  a9
168       int16x4_t s7 = vext_s16(s4, s8, 3);    //  a7  a8  a9 a10
169       int16x4_t s9 = vext_s16(s8, s12, 1);   //  a9 a10 a11 a12
170       int16x4_t s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
171       int16x4_t s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14
172 
173       int16x4_t d0 =
174           convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
175                          x_filter_0_7, x_filter_8_11, horiz_const);
176 
177       uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0)));
178 
179       store_u8_4x1(d, dd0);
180 
181       s += 4;
182       d += 4;
183       width -= 4;
184     } while (width != 0);
185     src_ptr += src_stride;
186     dst_ptr += dst_stride;
187   } while (--h != 0);
188 #endif  // AOM_ARCH_AARCH64
189 }
190 
191 static inline uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1,
192                                       const int16x8_t s2, const int16x8_t s3,
193                                       const int16x4_t filter,
194                                       int16x8_t horiz_const) {
195   int16x8_t sum = horiz_const;
196   sum = vmlaq_lane_s16(sum, s0, filter, 0);
197   sum = vmlaq_lane_s16(sum, s1, filter, 1);
198   sum = vmlaq_lane_s16(sum, s2, filter, 2);
199   sum = vmlaq_lane_s16(sum, s3, filter, 3);
200   // We halved the filter values so -1 from right shift.
201   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
202 }
203 
204 static inline void convolve_x_sr_4tap_neon(const uint8_t *src_ptr,
205                                            int src_stride, uint8_t *dst_ptr,
206                                            const int dst_stride, int w, int h,
207                                            const int16_t *x_filter_ptr) {
208   // All filter values are even; halve them to reduce intermediate precision
209   // requirements.
210   const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
211 
212   // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
213   // rounding right shift by FILTER_BITS - instead of a first rounding right
214   // shift by ROUND0_BITS, followed by a second rounding right shift by
215   // FILTER_BITS - ROUND0_BITS.
216   // The outermost -1 is needed because we will halve the filter values.
217   const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
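  // For example, with ROUND0_BITS == 3 and FILTER_BITS == 7 (assuming the
  // standard AV1 configuration): every tap is even, so the halved filter
  // computes exactly S / 2 for a full-precision filter sum S. Adding the shim
  // of 2 and rounding-shifting by FILTER_BITS - 1 in convolve4_8_x gives
  //   (S / 2 + 2 + 32) >> 6,
  // which equals the two-stage rounding (((S + 4) >> 3) + 8) >> 4.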
218 
219   if (w == 4) {
220     do {
221       uint8x8_t t01[4];
222       t01[0] = load_unaligned_u8(src_ptr + 0, src_stride);
223       t01[1] = load_unaligned_u8(src_ptr + 1, src_stride);
224       t01[2] = load_unaligned_u8(src_ptr + 2, src_stride);
225       t01[3] = load_unaligned_u8(src_ptr + 3, src_stride);
226 
227       int16x8_t s01[4];
228       s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
229       s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
230       s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
231       s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
232 
233       uint8x8_t d01 =
234           convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
235 
236       store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
237 
238       src_ptr += 2 * src_stride;
239       dst_ptr += 2 * dst_stride;
240       h -= 2;
241     } while (h != 0);
242   } else {
243     do {
244       int width = w;
245       const uint8_t *s = src_ptr;
246       uint8_t *d = dst_ptr;
247 
248       do {
249         uint8x8_t t0[4], t1[4];
250         load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
251         load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
252 
253         int16x8_t s0[4], s1[4];
254         s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
255         s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
256         s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
257         s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
258 
259         s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
260         s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
261         s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
262         s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
263 
264         uint8x8_t d0 =
265             convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
266         uint8x8_t d1 =
267             convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
268 
269         store_u8_8x2(d, dst_stride, d0, d1);
270 
271         s += 8;
272         d += 8;
273         width -= 8;
274       } while (width != 0);
275       src_ptr += 2 * src_stride;
276       dst_ptr += 2 * dst_stride;
277       h -= 2;
278     } while (h != 0);
279   }
280 }
281 
282 static inline uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
283                                       const int16x8_t s2, const int16x8_t s3,
284                                       const int16x8_t s4, const int16x8_t s5,
285                                       const int16x8_t s6, const int16x8_t s7,
286                                       const int16x8_t filter,
287                                       const int16x8_t horiz_const) {
288   const int16x4_t filter_lo = vget_low_s16(filter);
289   const int16x4_t filter_hi = vget_high_s16(filter);
290 
291   int16x8_t sum = horiz_const;
292   sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
293   sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
294   sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
295   sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
296   sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
297   sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
298   sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
299   sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
300 
301   // We halved the convolution filter values so -1 from the right shift.
302   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
303 }
304 
305 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
306                             int dst_stride, int w, int h,
307                             const InterpFilterParams *filter_params_x,
308                             const int subpel_x_qn,
309                             ConvolveParams *conv_params) {
310   if (w == 2 || h == 2) {
311     av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
312                         subpel_x_qn, conv_params);
313     return;
314   }
315 
316   const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
317   src -= horiz_offset;
318 
319   const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
320       filter_params_x, subpel_x_qn & SUBPEL_MASK);
321 
322   int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);
323 
324   if (filter_taps > 8) {
325     convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
326                              x_filter_ptr);
327     return;
328   }
329 
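  // The 4-tap kernels occupy indices 2..5 of the filter array (hence
  // x_filter_ptr + 2 inside convolve_x_sr_4tap_neon), so src + 2 below
  // re-centres the source after the full-width horiz_offset applied above.
  // This assumes short filters are stored in the 8-wide array with zero outer
  // taps.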
330   if (filter_taps <= 4) {
331     convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h,
332                             x_filter_ptr);
333     return;
334   }
335 
336   // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
337   // rounding right shift by FILTER_BITS - instead of a first rounding right
338   // shift by ROUND0_BITS, followed by a second rounding right shift by
339   // FILTER_BITS - ROUND0_BITS.
340   // The outermost -1 is needed because we will halve the filter values.
341   const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
342 
343   // Filter values are even so halve to reduce precision requirements.
344   const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
345 
346 #if AOM_ARCH_AARCH64
347   while (h >= 8) {
348     uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
349     load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
350 
351     transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
352     int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
353     int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
354     int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
355     int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
356     int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
357     int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
358     int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
359 
360     int width = w;
361     const uint8_t *s = src + 7;
362     uint8_t *d = dst;
363 
364     __builtin_prefetch(d + 0 * dst_stride);
365     __builtin_prefetch(d + 1 * dst_stride);
366     __builtin_prefetch(d + 2 * dst_stride);
367     __builtin_prefetch(d + 3 * dst_stride);
368     __builtin_prefetch(d + 4 * dst_stride);
369     __builtin_prefetch(d + 5 * dst_stride);
370     __builtin_prefetch(d + 6 * dst_stride);
371     __builtin_prefetch(d + 7 * dst_stride);
372 
373     do {
374       uint8x8_t t8, t9, t10, t11, t12, t13, t14;
375       load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
376 
377       transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
378                                      &t14);
379       int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
380       int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
381       int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
382       int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
383       int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
384       int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
385       int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
386       int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
387 
388       uint8x8_t d0 =
389           convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
390       uint8x8_t d1 =
391           convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const);
392       uint8x8_t d2 =
393           convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const);
394       uint8x8_t d3 =
395           convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const);
396       uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
397                                    horiz_const);
398       uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
399                                    horiz_const);
400       uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
401                                    horiz_const);
402       uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
403                                    x_filter, horiz_const);
404 
405       transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
406 
407       store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
408 
409       s0 = s8;
410       s1 = s9;
411       s2 = s10;
412       s3 = s11;
413       s4 = s12;
414       s5 = s13;
415       s6 = s14;
416       s += 8;
417       d += 8;
418       width -= 8;
419     } while (width != 0);
420     src += 8 * src_stride;
421     dst += 8 * dst_stride;
422     h -= 8;
423   }
424 #endif  // AOM_ARCH_AARCH64
425 
426   while (h-- != 0) {
427     uint8x8_t t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
428     int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
429 
430     int width = w;
431     const uint8_t *s = src + 8;
432     uint8_t *d = dst;
433 
434     __builtin_prefetch(d);
435 
436     do {
437       uint8x8_t t8 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
438       int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
439 
440       int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
441       int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
442       int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
443       int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
444       int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
445       int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
446       int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
447 
448       uint8x8_t d0 =
449           convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
450 
451       vst1_u8(d, d0);
452 
453       s0 = s8;
454       s += 8;
455       d += 8;
456       width -= 8;
457     } while (width != 0);
458     src += src_stride;
459     dst += dst_stride;
460   }
461 }
462 
463 static inline uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1,
464                                       const int16x8_t s2, const int16x8_t s3,
465                                       const int16x4_t filter) {
466   int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
467   sum = vmlaq_lane_s16(sum, s1, filter, 1);
468   sum = vmlaq_lane_s16(sum, s2, filter, 2);
469   sum = vmlaq_lane_s16(sum, s3, filter, 3);
470 
471   // We halved the filter values so -1 from right shift.
472   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
473 }
474 
475 static inline void convolve_y_sr_4tap_neon(const uint8_t *src,
476                                            const int src_stride, uint8_t *dst,
477                                            const int dst_stride, int w, int h,
478                                            const int16_t *filter_y) {
479   // All filter values are even; halve them to reduce intermediate precision
480   // requirements.
481   const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);
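  // No shim constant is needed in the vertical-only path: the result is
  // produced by a single rounding right shift by FILTER_BITS (implemented as
  // FILTER_BITS - 1 in convolve4_8_y because the taps were halved above).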
482 
483   if (w == 4) {
484     uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
485     uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
486 
487     int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
488     int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
489 
490     src += 2 * src_stride;
491 
492     do {
493       uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
494       uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
495       uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
496       uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
497 
498       int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
499       int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
500       int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
501       int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));
502 
503       uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter);
504       uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter);
505 
506       store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
507       store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
508 
509       s01 = s45;
510       s12 = s56;
511 
512       src += 4 * src_stride;
513       dst += 4 * dst_stride;
514       h -= 4;
515     } while (h != 0);
516   } else {
517     do {
518       uint8x8_t t0, t1, t2;
519       load_u8_8x3(src, src_stride, &t0, &t1, &t2);
520 
521       int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
522       int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
523       int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
524 
525       int height = h;
526       const uint8_t *s = src + 3 * src_stride;
527       uint8_t *d = dst;
528 
529       do {
530         uint8x8_t t3;
531         load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
532 
533         int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
534         int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
535         int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
536         int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
537 
538         uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter);
539         uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter);
540         uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, filter);
541         uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter);
542 
543         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
544 
545         s0 = s4;
546         s1 = s5;
547         s2 = s6;
548 
549         s += 4 * src_stride;
550         d += 4 * dst_stride;
551         height -= 4;
552       } while (height != 0);
553       src += 8;
554       dst += 8;
555       w -= 8;
556     } while (w != 0);
557   }
558 }
559 
560 static inline int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
561                                       const int16x4_t s2, const int16x4_t s3,
562                                       const int16x4_t s4, const int16x4_t s5,
563                                       const int16x8_t y_filter_0_7) {
564   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
565   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
566 
567   // Filter values at indices 0 and 7 are 0.
568   int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
569   sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
570   sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
571   sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
572   sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
573   sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
574 
575   return sum;
576 }
577 
578 static inline uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
579                                       const int16x8_t s2, const int16x8_t s3,
580                                       const int16x8_t s4, const int16x8_t s5,
581                                       const int16x8_t y_filters) {
582   const int16x4_t y_filter_lo = vget_low_s16(y_filters);
583   const int16x4_t y_filter_hi = vget_high_s16(y_filters);
584 
585   // Filter values at indices 0 and 7 are 0.
586   int16x8_t sum = vmulq_lane_s16(s0, y_filter_lo, 1);
587   sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2);
588   sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3);
589   sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0);
590   sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1);
591   sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2);
592   // We halved the convolution filter values so -1 from the right shift.
593   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
594 }
595 
596 static inline void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
597                                            int src_stride, uint8_t *dst_ptr,
598                                            const int dst_stride, int w, int h,
599                                            const int16x8_t y_filter) {
600   if (w <= 4) {
601     uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
602     uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
603     uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
604     uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
605     uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
606 
607     int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
608     int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
609     int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
610     int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
611     int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
612 
613     src_ptr += 5 * src_stride;
614 
615     do {
616 #if AOM_ARCH_AARCH64
617       uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
618       uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
619       uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
620       uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
621 
622       int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
623       int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
624       int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
625       int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
626 
627       int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
628       int16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter);
629       int16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter);
630       int16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter);
631 
632       // We halved the convolution filter values so -1 from the right shift.
633       uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
634       uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
635 
636       store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
637       store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
638 
639       s0 = s4;
640       s1 = s5;
641       s2 = s6;
642       s3 = s7;
643       s4 = s8;
644       src_ptr += 4 * src_stride;
645       dst_ptr += 4 * dst_stride;
646       h -= 4;
647 #else   // !AOM_ARCH_AARCH64
648       uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr);
649       int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
650 
651       int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
652       // We halved the convolution filter values so -1 from the right shift.
653       uint8x8_t d01 =
654           vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
655 
656       store_u8_4x1(dst_ptr, d01);
657 
658       s0 = s1;
659       s1 = s2;
660       s2 = s3;
661       s3 = s4;
662       s4 = s5;
663       src_ptr += src_stride;
664       dst_ptr += dst_stride;
665       h--;
666 #endif  // AOM_ARCH_AARCH64
667     } while (h != 0);
668 
669   } else {
670     do {
671       const uint8_t *s = src_ptr;
672       uint8_t *d = dst_ptr;
673       int height = h;
674 
675       uint8x8_t t0, t1, t2, t3, t4;
676       load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4);
677 
678       int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
679       int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
680       int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
681       int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
682       int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
683 
684       s += 5 * src_stride;
685 
686       do {
687 #if AOM_ARCH_AARCH64
688         uint8x8_t t5, t6, t7, t8;
689         load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);
690 
691         int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
692         int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
693         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
694         int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
695 
696         uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
697         uint8x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter);
698         uint8x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter);
699         uint8x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter);
700 
701         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
702 
703         s0 = s4;
704         s1 = s5;
705         s2 = s6;
706         s3 = s7;
707         s4 = s8;
708         s += 4 * src_stride;
709         d += 4 * dst_stride;
710         height -= 4;
711 #else   // !AOM_ARCH_AARCH64
712         int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
713 
714         uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
715 
716         vst1_u8(d, d0);
717 
718         s0 = s1;
719         s1 = s2;
720         s2 = s3;
721         s3 = s4;
722         s4 = s5;
723         s += src_stride;
724         d += dst_stride;
725         height--;
726 #endif  // AOM_ARCH_AARCH64
727       } while (height != 0);
728       src_ptr += 8;
729       dst_ptr += 8;
730       w -= 8;
731     } while (w != 0);
732   }
733 }
734 
735 static inline int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
736                                       const int16x4_t s2, const int16x4_t s3,
737                                       const int16x4_t s4, const int16x4_t s5,
738                                       const int16x4_t s6, const int16x4_t s7,
739                                       const int16x8_t filter) {
740   const int16x4_t filter_lo = vget_low_s16(filter);
741   const int16x4_t filter_hi = vget_high_s16(filter);
742 
743   int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0);
744   sum = vmla_lane_s16(sum, s1, filter_lo, 1);
745   sum = vmla_lane_s16(sum, s2, filter_lo, 2);
746   sum = vmla_lane_s16(sum, s3, filter_lo, 3);
747   sum = vmla_lane_s16(sum, s4, filter_hi, 0);
748   sum = vmla_lane_s16(sum, s5, filter_hi, 1);
749   sum = vmla_lane_s16(sum, s6, filter_hi, 2);
750   sum = vmla_lane_s16(sum, s7, filter_hi, 3);
751 
752   return sum;
753 }
754 
755 static inline uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
756                                       const int16x8_t s2, const int16x8_t s3,
757                                       const int16x8_t s4, const int16x8_t s5,
758                                       const int16x8_t s6, const int16x8_t s7,
759                                       const int16x8_t filter) {
760   const int16x4_t filter_lo = vget_low_s16(filter);
761   const int16x4_t filter_hi = vget_high_s16(filter);
762 
763   int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0);
764   sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
765   sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
766   sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
767   sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
768   sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
769   sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
770   sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
771 
772   // We halved the convolution filter values so -1 from the right shift.
773   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
774 }
775 
776 static inline void convolve_y_sr_8tap_neon(const uint8_t *src_ptr,
777                                            int src_stride, uint8_t *dst_ptr,
778                                            const int dst_stride, int w, int h,
779                                            const int16x8_t y_filter) {
780   if (w <= 4) {
781     uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
782     uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
783     uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
784     uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
785     uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
786     uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride);
787     uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride);
788 
789     int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
790     int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
791     int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
792     int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
793     int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
794     int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
795     int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
796 
797     src_ptr += 7 * src_stride;
798 
799     do {
800 #if AOM_ARCH_AARCH64
801       uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
802       uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
803       uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
804       uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
805 
806       int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
807       int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
808       int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
809       int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
810 
811       int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
812       int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
813       int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
814       int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
815 
816       // We halved the convolution filter values so -1 from the right shift.
817       uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
818       uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
819 
820       store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
821       store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
822 
823       s0 = s4;
824       s1 = s5;
825       s2 = s6;
826       s3 = s7;
827       s4 = s8;
828       s5 = s9;
829       s6 = s10;
830       src_ptr += 4 * src_stride;
831       dst_ptr += 4 * dst_stride;
832       h -= 4;
833 #else   // !AOM_ARCH_AARCH64
834       uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr);
835       int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
836 
837       int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
838       // We halved the convolution filter values so -1 from the right shift.
839       uint8x8_t d01 =
840           vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
841 
842       store_u8_4x1(dst_ptr, d01);
843 
844       s0 = s1;
845       s1 = s2;
846       s2 = s3;
847       s3 = s4;
848       s4 = s5;
849       s5 = s6;
850       s6 = s7;
851       src_ptr += src_stride;
852       dst_ptr += dst_stride;
853       h--;
854 #endif  // AOM_ARCH_AARCH64
855     } while (h != 0);
856   } else {
857     do {
858       const uint8_t *s = src_ptr;
859       uint8_t *d = dst_ptr;
860       int height = h;
861 
862       uint8x8_t t0, t1, t2, t3, t4, t5, t6;
863       load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
864 
865       int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
866       int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
867       int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
868       int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
869       int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
870       int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
871       int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
872 
873       s += 7 * src_stride;
874 
875       do {
876 #if AOM_ARCH_AARCH64
877         uint8x8_t t7, t8, t9, t10;
878         load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
879 
880         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
881         int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
882         int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
883         int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
884 
885         uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
886         uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
887         uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
888         uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
889 
890         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
891 
892         s0 = s4;
893         s1 = s5;
894         s2 = s6;
895         s3 = s7;
896         s4 = s8;
897         s5 = s9;
898         s6 = s10;
899         s += 4 * src_stride;
900         d += 4 * dst_stride;
901         height -= 4;
902 #else   // !AOM_ARCH_AARCH64
903         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
904 
905         uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
906 
907         vst1_u8(d, d0);
908 
909         s0 = s1;
910         s1 = s2;
911         s2 = s3;
912         s3 = s4;
913         s4 = s5;
914         s5 = s6;
915         s6 = s7;
916         s += src_stride;
917         d += dst_stride;
918         height--;
919 #endif  // AOM_ARCH_AARCH64
920       } while (height != 0);
921       src_ptr += 8;
922       dst_ptr += 8;
923       w -= 8;
924     } while (w != 0);
925   }
926 }
927 
928 static inline int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1,
929                                        const int16x4_t s2, const int16x4_t s3,
930                                        const int16x4_t s4, const int16x4_t s5,
931                                        const int16x4_t s6, const int16x4_t s7,
932                                        const int16x4_t s8, const int16x4_t s9,
933                                        const int16x4_t s10, const int16x4_t s11,
934                                        const int16x8_t y_filter_0_7,
935                                        const int16x4_t y_filter_8_11) {
936   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
937   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
938   int16x4_t sum;
939 
940   sum = vmul_lane_s16(s0, y_filter_0_3, 0);
941   sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
942   sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
943   sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
944   sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
945 
946   sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
947   sum = vmla_lane_s16(sum, s8, y_filter_8_11, 0);
948   sum = vmla_lane_s16(sum, s9, y_filter_8_11, 1);
949   sum = vmla_lane_s16(sum, s10, y_filter_8_11, 2);
950   sum = vmla_lane_s16(sum, s11, y_filter_8_11, 3);
951 
952   // Saturating addition is required for the largest filter taps to avoid
953   // overflow (while staying in 16-bit elements).
954   sum = vqadd_s16(sum, vmul_lane_s16(s5, y_filter_4_7, 1));
955   sum = vqadd_s16(sum, vmul_lane_s16(s6, y_filter_4_7, 2));
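  // s5 and s6 are multiplied by taps 5 and 6, the centre (largest-magnitude)
  // taps of the 12-tap kernel, which is why those two products are the ones
  // accumulated with saturating adds.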
956 
957   return sum;
958 }
959 
960 static inline uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1,
961                                        const int16x8_t s2, const int16x8_t s3,
962                                        const int16x8_t s4, const int16x8_t s5,
963                                        const int16x8_t s6, const int16x8_t s7,
964                                        const int16x8_t s8, const int16x8_t s9,
965                                        const int16x8_t s10, const int16x8_t s11,
966                                        const int16x8_t y_filter_0_7,
967                                        const int16x4_t y_filter_8_11) {
968   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
969   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
970   int16x8_t sum;
971 
972   sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
973   sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
974   sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
975   sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
976   sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
977 
978   sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
979   sum = vmlaq_lane_s16(sum, s8, y_filter_8_11, 0);
980   sum = vmlaq_lane_s16(sum, s9, y_filter_8_11, 1);
981   sum = vmlaq_lane_s16(sum, s10, y_filter_8_11, 2);
982   sum = vmlaq_lane_s16(sum, s11, y_filter_8_11, 3);
983 
984   // Saturating addition is required for the largest filter taps to avoid
985   // overflow (while staying in 16-bit elements).
986   sum = vqaddq_s16(sum, vmulq_lane_s16(s5, y_filter_4_7, 1));
987   sum = vqaddq_s16(sum, vmulq_lane_s16(s6, y_filter_4_7, 2));
988 
989   return vqrshrun_n_s16(sum, FILTER_BITS);
990 }
991 
992 static inline void convolve_y_sr_12tap_neon(const uint8_t *src_ptr,
993                                             int src_stride, uint8_t *dst_ptr,
994                                             int dst_stride, int w, int h,
995                                             const int16_t *y_filter_ptr) {
996   const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
997   const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
998 
999   if (w <= 4) {
1000     uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
1001     load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
1002                  &t8, &t9, &t10);
1003     int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1004     int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1005     int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1006     int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1007     int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
1008     int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
1009     int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
1010     int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
1011     int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
1012     int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
1013     int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
1014 
1015     src_ptr += 11 * src_stride;
1016 
1017     do {
1018       uint8x8_t t11, t12, t13, t14;
1019       load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14);
1020 
1021       int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11)));
1022       int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12)));
1023       int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13)));
1024       int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14)));
1025 
1026       int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
1027                                     s11, y_filter_0_7, y_filter_8_11);
1028       int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
1029                                     s11, s12, y_filter_0_7, y_filter_8_11);
1030       int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1031                                     s12, s13, y_filter_0_7, y_filter_8_11);
1032       int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
1033                                     s13, s14, y_filter_0_7, y_filter_8_11);
1034 
1035       uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
1036       uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
1037 
1038       store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
1039       store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
1040 
1041       s0 = s4;
1042       s1 = s5;
1043       s2 = s6;
1044       s3 = s7;
1045       s4 = s8;
1046       s5 = s9;
1047       s6 = s10;
1048       s7 = s11;
1049       s8 = s12;
1050       s9 = s13;
1051       s10 = s14;
1052       src_ptr += 4 * src_stride;
1053       dst_ptr += 4 * dst_stride;
1054       h -= 4;
1055     } while (h != 0);
1056 
1057   } else {
1058     do {
1059       const uint8_t *s = src_ptr;
1060       uint8_t *d = dst_ptr;
1061       int height = h;
1062 
1063       uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
1064       load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
1065                    &t9, &t10);
1066       int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1067       int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1068       int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1069       int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1070       int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1071       int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1072       int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1073       int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
1074       int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
1075       int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
1076       int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
1077 
1078       s += 11 * src_stride;
1079 
1080       do {
1081         uint8x8_t t11, t12, t13, t14;
1082         load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14);
1083 
1084         int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
1085         int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
1086         int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
1087         int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
1088 
1089         uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
1090                                       s10, s11, y_filter_0_7, y_filter_8_11);
1091         uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
1092                                       s11, s12, y_filter_0_7, y_filter_8_11);
1093         uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1094                                       s12, s13, y_filter_0_7, y_filter_8_11);
1095         uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
1096                                       s13, s14, y_filter_0_7, y_filter_8_11);
1097 
1098         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
1099 
1100         s0 = s4;
1101         s1 = s5;
1102         s2 = s6;
1103         s3 = s7;
1104         s4 = s8;
1105         s5 = s9;
1106         s6 = s10;
1107         s7 = s11;
1108         s8 = s12;
1109         s9 = s13;
1110         s10 = s14;
1111         s += 4 * src_stride;
1112         d += 4 * dst_stride;
1113         height -= 4;
1114       } while (height != 0);
1115       src_ptr += 8;
1116       dst_ptr += 8;
1117       w -= 8;
1118     } while (w != 0);
1119   }
1120 }
1121 
1122 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
1123                             int dst_stride, int w, int h,
1124                             const InterpFilterParams *filter_params_y,
1125                             const int subpel_y_qn) {
1126   if (w == 2 || h == 2) {
1127     av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
1128                         subpel_y_qn);
1129     return;
1130   }
1131 
1132   const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
1133   const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
1134   const int vert_offset = clamped_y_taps / 2 - 1;
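  // Filters reporting fewer than 4 taps (e.g. bilinear) are clamped here and
  // dispatched to the 4-tap path below; this assumes their unused outer taps
  // are zero, so the extra multiply-accumulates do not change the result.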
1135 
1136   src -= vert_offset * src_stride;
1137 
1138   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
1139       filter_params_y, subpel_y_qn & SUBPEL_MASK);
1140 
1141   if (y_filter_taps > 8) {
1142     convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
1143                              y_filter_ptr);
1144     return;
1145   }
1146 
1147   // Filter values are even so halve to reduce precision requirements.
1148   const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
1149 
1150   if (y_filter_taps <= 4) {
1151     convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h,
1152                             y_filter_ptr);
1153   } else if (y_filter_taps == 6) {
1154     convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
1155   } else {
1156     convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
1157   }
1158 }
1159 
1160 static inline int16x4_t convolve12_4_2d_h(
1161     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
1162     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
1163     const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
1164     const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
1165     const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
1166     const int32x4_t horiz_const) {
1167   const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
1168   const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
1169 
1170   int32x4_t sum = horiz_const;
1171   sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
1172   sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
1173   sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
1174   sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
1175   sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
1176   sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
1177   sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
1178   sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
1179   sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
1180   sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
1181   sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
1182   sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);
1183 
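  // vshrn_n_s32 below is a truncating (non-rounding) shift; the rounding
  // offset 1 << (ROUND0_BITS - 1) is expected to have been folded into
  // horiz_const by the caller.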
1184   return vshrn_n_s32(sum, ROUND0_BITS);
1185 }
1186 
1187 static inline void convolve_2d_sr_horiz_12tap_neon(
1188     const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
1189     const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
1190     const int16x4_t x_filter_8_11) {
1191   const int bd = 8;
1192   // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
1193   // which are generally faster than rounding shifts on modern CPUs.
1194   const int32x4_t horiz_const =
1195       vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
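  // The (1 << (bd + FILTER_BITS - 1)) term is the intermediate offset used by
  // the 2D convolution: it keeps the horizontally filtered values non-negative
  // so they fit comfortably in the int16_t scratch buffer, and (assuming the
  // usual arrangement) the vertical pass removes a matching offset again.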
1196 
1197 #if AOM_ARCH_AARCH64
1198   do {
1199     const uint8_t *s = src_ptr;
1200     int16_t *d = dst_ptr;
1201     int width = w;
1202 
1203     uint8x8_t t0, t1, t2, t3;
1204     load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1205     transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
1206 
1207     int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1208     int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1209     int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1210     int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1211     int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1212     int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1213     int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1214     int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1215 
1216     load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
1217     transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
1218 
1219     int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1220     int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1221     int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1222 
1223     s += 11;
1224 
1225     do {
1226       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1227       transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
1228 
1229       int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1230       int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1231       int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1232       int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1233 
1234       int16x4_t d0 =
1235           convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1236                             x_filter_0_7, x_filter_8_11, horiz_const);
1237       int16x4_t d1 =
1238           convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
1239                             x_filter_0_7, x_filter_8_11, horiz_const);
1240       int16x4_t d2 =
1241           convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
1242                             x_filter_0_7, x_filter_8_11, horiz_const);
1243       int16x4_t d3 =
1244           convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
1245                             x_filter_0_7, x_filter_8_11, horiz_const);
1246 
1247       transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
1248       store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
1249 
1250       s0 = s4;
1251       s1 = s5;
1252       s2 = s6;
1253       s3 = s7;
1254       s4 = s8;
1255       s5 = s9;
1256       s6 = s10;
1257       s7 = s11;
1258       s8 = s12;
1259       s9 = s13;
1260       s10 = s14;
1261       s += 4;
1262       d += 4;
1263       width -= 4;
1264     } while (width != 0);
1265     src_ptr += 4 * src_stride;
1266     dst_ptr += 4 * dst_stride;
1267     h -= 4;
1268   } while (h > 4);
1269 #endif  // AOM_ARCH_AARCH64
1270 
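  // Process any remaining rows (all rows when not built for AArch64) one at a
  // time, producing 4 output pixels per inner-loop iteration.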
1271   do {
1272     const uint8_t *s = src_ptr;
1273     int16_t *d = dst_ptr;
1274     int width = w;
1275 
1276     do {
1277       uint8x16_t t0 = vld1q_u8(s);
1278       int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
1279       int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
1280 
1281       int16x4_t s0 = vget_low_s16(tt0);
1282       int16x4_t s4 = vget_high_s16(tt0);
1283       int16x4_t s8 = vget_low_s16(tt1);
1284       int16x4_t s12 = vget_high_s16(tt1);
1285 
1286       int16x4_t s1 = vext_s16(s0, s4, 1);    //  a1  a2  a3  a4
1287       int16x4_t s2 = vext_s16(s0, s4, 2);    //  a2  a3  a4  a5
1288       int16x4_t s3 = vext_s16(s0, s4, 3);    //  a3  a4  a5  a6
1289       int16x4_t s5 = vext_s16(s4, s8, 1);    //  a5  a6  a7  a8
1290       int16x4_t s6 = vext_s16(s4, s8, 2);    //  a6  a7  a8  a9
1291       int16x4_t s7 = vext_s16(s4, s8, 3);    //  a7  a8  a9 a10
1292       int16x4_t s9 = vext_s16(s8, s12, 1);   //  a9 a10 a11 a12
1293       int16x4_t s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
1294       int16x4_t s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14
1295 
1296       int16x4_t d0 =
1297           convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1298                             x_filter_0_7, x_filter_8_11, horiz_const);
1299       vst1_s16(d, d0);
1300 
1301       s += 4;
1302       d += 4;
1303       width -= 4;
1304     } while (width != 0);
1305     src_ptr += src_stride;
1306     dst_ptr += dst_stride;
1307   } while (--h != 0);
1308 }
1309 
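// Apply the (halved) 4-tap horizontal filter to 8 pixels of a row; the caller
// pre-adds the rounding shim via horiz_const.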
1310 static inline int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1,
1311                                          const int16x8_t s2, const int16x8_t s3,
1312                                          const int16x4_t filter,
1313                                          const int16x8_t horiz_const) {
1314   int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0);
1315   sum = vmlaq_lane_s16(sum, s1, filter, 1);
1316   sum = vmlaq_lane_s16(sum, s2, filter, 2);
1317   sum = vmlaq_lane_s16(sum, s3, filter, 3);
1318   // We halved the filter values, so reduce the right shift by 1.
1319   return vshrq_n_s16(sum, ROUND0_BITS - 1);
1320 }
1321 
1322 static inline void convolve_2d_sr_horiz_4tap_neon(
1323     const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
1324     ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
1325   const int bd = 8;
1326   // All filter values are even, halve to reduce intermediate precision
1327   // requirements.
1328   const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1);
1329 
1330   // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
1331   // shifts - which are generally faster than rounding shifts on modern CPUs.
1332   // (The extra -1 is needed because we halved the filter values.)
1333   const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
1334                                             (1 << ((ROUND0_BITS - 1) - 1)));
1335 
1336   if (w == 4) {
1337     do {
1338       uint8x8_t t01[4];
1339       t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
1340       t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
1341       t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
1342       t01[3] = load_unaligned_u8(src + 3, (int)src_stride);
1343 
1344       int16x8_t s01[4];
1345       s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
1346       s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
1347       s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
1348       s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
1349 
1350       int16x8_t d01 =
1351           convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
1352 
1353       store_s16x4_strided_x2(dst, (int)dst_stride, d01);
1354 
1355       src += 2 * src_stride;
1356       dst += 2 * dst_stride;
1357       h -= 2;
1358     } while (h > 0);
1359   } else {
1360     do {
1361       int width = w;
1362       const uint8_t *s = src;
1363       int16_t *d = dst;
1364 
1365       do {
1366         uint8x8_t t0[4], t1[4];
1367         load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
1368         load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
1369 
1370         int16x8_t s0[4];
1371         s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
1372         s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
1373         s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
1374         s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
1375 
1376         int16x8_t s1[4];
1377         s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
1378         s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
1379         s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
1380         s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
1381 
1382         int16x8_t d0 =
1383             convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
1384         int16x8_t d1 =
1385             convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
1386 
1387         store_s16_8x2(d, dst_stride, d0, d1);
1388 
1389         s += 8;
1390         d += 8;
1391         width -= 8;
1392       } while (width != 0);
1393       src += 2 * src_stride;
1394       dst += 2 * dst_stride;
1395       h -= 2;
1396     } while (h > 2);
1397 
1398     do {
1399       const uint8_t *s = src;
1400       int16_t *d = dst;
1401       int width = w;
1402 
1403       do {
1404         uint8x8_t t0[4];
1405         load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
1406 
1407         int16x8_t s0[4];
1408         s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
1409         s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
1410         s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
1411         s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
1412 
1413         int16x8_t d0 =
1414             convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
1415 
1416         vst1q_s16(d, d0);
1417 
1418         s += 8;
1419         d += 8;
1420         width -= 8;
1421       } while (width != 0);
1422       src += src_stride;
1423       dst += dst_stride;
1424     } while (--h != 0);
1425   }
1426 }
1427 
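// Apply the (halved) 8-tap horizontal filter to 8 pixels of a row.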
1428 static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
1429                                          const int16x8_t s2, const int16x8_t s3,
1430                                          const int16x8_t s4, const int16x8_t s5,
1431                                          const int16x8_t s6, const int16x8_t s7,
1432                                          const int16x8_t filter,
1433                                          const int16x8_t horiz_const) {
1434   const int16x4_t filter_lo = vget_low_s16(filter);
1435   const int16x4_t filter_hi = vget_high_s16(filter);
1436 
1437   int16x8_t sum = horiz_const;
1438   sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
1439   sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
1440   sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
1441   sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
1442   sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
1443   sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
1444   sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
1445   sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
1446 
1447   // We halved the convolution filter values, so reduce the right shift by 1.
1448   return vshrq_n_s16(sum, ROUND0_BITS - 1);
1449 }
1450 
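// Horizontal pass for x filters with more than 4 taps (6- and 8-tap kernels,
// stored as 8 coefficients). On AArch64, 8 rows are processed per iteration
// using an 8x8 transpose; any remaining rows are handled one at a time.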
1451 static inline void convolve_2d_sr_horiz_8tap_neon(
1452     const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
1453     int im_h, const int16_t *x_filter_ptr) {
1454   const int bd = 8;
1455 
1456   const uint8_t *src_ptr = src;
1457   int16_t *dst_ptr = im_block;
1458   int dst_stride = im_stride;
1459   int height = im_h;
1460 
1461   // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
1462   // shifts - which are generally faster than rounding shifts on modern CPUs.
1463   // (The extra -1 is needed because we halved the filter values.)
1464   const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
1465                                             (1 << ((ROUND0_BITS - 1) - 1)));
1466   // Filter values are even, so halve to reduce intermediate precision reqs.
1467   const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
1468 
1469 #if AOM_ARCH_AARCH64
1470   while (height > 8) {
1471     const uint8_t *s = src_ptr;
1472     int16_t *d = dst_ptr;
1473     int width = w;
1474 
1475     uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
1476     load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1477     transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1478 
1479     int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1480     int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1481     int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1482     int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1483     int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1484     int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1485     int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1486 
1487     s += 7;
1488 
1489     do {
1490       load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1491 
1492       transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1493 
1494       int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
1495       int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1496       int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
1497       int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
1498       int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
1499       int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
1500       int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
1501       int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
1502 
1503       int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
1504                                       horiz_const);
1505       int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
1506                                       horiz_const);
1507       int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
1508                                       horiz_const);
1509       int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
1510                                       horiz_const);
1511       int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
1512                                       x_filter, horiz_const);
1513       int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
1514                                       x_filter, horiz_const);
1515       int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
1516                                       x_filter, horiz_const);
1517       int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
1518                                       x_filter, horiz_const);
1519 
1520       transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
1521 
1522       store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
1523 
1524       s0 = s8;
1525       s1 = s9;
1526       s2 = s10;
1527       s3 = s11;
1528       s4 = s12;
1529       s5 = s13;
1530       s6 = s14;
1531       s += 8;
1532       d += 8;
1533       width -= 8;
1534     } while (width != 0);
1535     src_ptr += 8 * src_stride;
1536     dst_ptr += 8 * dst_stride;
1537     height -= 8;
1538   }
1539 #endif  // AOM_ARCH_AARCH64
1540 
1541   do {
1542     const uint8_t *s = src_ptr;
1543     int16_t *d = dst_ptr;
1544     int width = w;
1545 
1546     uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
1547     int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1548 
1549     do {
1550       uint8x8_t t1 = vld1_u8(s + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
1551       int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1552 
1553       int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
1554       int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
1555       int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
1556       int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
1557       int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
1558       int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
1559       int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
1560 
1561       int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
1562                                       horiz_const);
1563 
1564       vst1q_s16(d, d0);
1565 
1566       s0 = s8;
1567       s += 8;
1568       d += 8;
1569       width -= 8;
1570     } while (width != 0);
1571     src_ptr += src_stride;
1572     dst_ptr += dst_stride;
1573   } while (--height != 0);
1574 }
1575 
1576 void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
1577                              int dst_stride, int w, int h,
1578                              const InterpFilterParams *filter_params_x,
1579                              const InterpFilterParams *filter_params_y,
1580                              const int subpel_x_qn, const int subpel_y_qn,
1581                              ConvolveParams *conv_params) {
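  // 2-wide and 2-high blocks are not handled by the NEON paths below, so fall
  // back to the C implementation.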
1582   if (w == 2 || h == 2) {
1583     av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
1584                          filter_params_x, filter_params_y, subpel_x_qn,
1585                          subpel_y_qn, conv_params);
1586     return;
1587   }
1588 
1589   const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
1590   const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
1591   const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
1592   const int im_h = h + clamped_y_taps - 1;
1593   const int im_stride = MAX_SB_SIZE;
1594   const int vert_offset = clamped_y_taps / 2 - 1;
1595   const int horiz_offset = filter_params_x->taps / 2 - 1;
1596   const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
1597 
1598   const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
1599       filter_params_x, subpel_x_qn & SUBPEL_MASK);
1600   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
1601       filter_params_y, subpel_y_qn & SUBPEL_MASK);
1602 
1603   if (filter_params_x->taps > 8) {
1604     DECLARE_ALIGNED(16, int16_t,
1605                     im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
1606 
1607     const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
1608     const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
1609     const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
1610     const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
1611 
1612     convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w,
1613                                     im_h, x_filter_0_7, x_filter_8_11);
1614 
1615     convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1616                                    y_filter_0_7, y_filter_8_11);
1617   } else {
1618     DECLARE_ALIGNED(16, int16_t,
1619                     im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
1620 
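    // The 4-tap kernels occupy taps 2..5 of the 8-coefficient filter array
    // (hence the filter_x + 2 load in the 4-tap path), so advance the source
    // pointer by 2 to undo the 8-tap centring applied to horiz_offset above.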
1621     if (x_filter_taps <= 4) {
1622       convolve_2d_sr_horiz_4tap_neon(src_ptr + 2, src_stride, im_block,
1623                                      im_stride, w, im_h, x_filter_ptr);
1624     } else {
1625       convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride,
1626                                      w, im_h, x_filter_ptr);
1627     }
1628 
1629     const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
1630 
1631     if (clamped_y_taps <= 4) {
1632       convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1633                                     y_filter_ptr);
1634     } else if (clamped_y_taps == 6) {
1635       convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1636                                     y_filter);
1637     } else {
1638       convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1639                                     y_filter);
1640     }
1641   }
1642 }
1643 
1644 void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
1645                                     uint8_t *dst, int dst_stride, int w, int h,
1646                                     const InterpFilterParams *filter_params_x,
1647                                     const int subpel_x_qn,
1648                                     ConvolveParams *conv_params) {
1649   assert(subpel_x_qn == 8);
1650   assert(filter_params_x->taps == 2);
1651   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
1652   (void)filter_params_x;
1653   (void)subpel_x_qn;
1654   (void)conv_params;
1655 
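  // With subpel_x_qn == 8 the 2-tap filter is { 64, 64 }, i.e. the rounded
  // average of two horizontally adjacent pixels, which maps directly onto a
  // rounding halving add.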
1656   if (w <= 4) {
1657     do {
1658       uint8x8_t s0_0 = vld1_u8(src);
1659       uint8x8_t s0_1 = vld1_u8(src + 1);
1660       uint8x8_t s1_0 = vld1_u8(src + src_stride);
1661       uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
1662 
1663       uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
1664       uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
1665 
1666       if (w == 2) {
1667         store_u8_2x1(dst + 0 * dst_stride, d0);
1668         store_u8_2x1(dst + 1 * dst_stride, d1);
1669       } else {
1670         store_u8_4x1(dst + 0 * dst_stride, d0);
1671         store_u8_4x1(dst + 1 * dst_stride, d1);
1672       }
1673 
1674       src += 2 * src_stride;
1675       dst += 2 * dst_stride;
1676       h -= 2;
1677     } while (h != 0);
1678   } else if (w == 8) {
1679     do {
1680       uint8x8_t s0_0 = vld1_u8(src);
1681       uint8x8_t s0_1 = vld1_u8(src + 1);
1682       uint8x8_t s1_0 = vld1_u8(src + src_stride);
1683       uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
1684 
1685       uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
1686       uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
1687 
1688       vst1_u8(dst, d0);
1689       vst1_u8(dst + dst_stride, d1);
1690 
1691       src += 2 * src_stride;
1692       dst += 2 * dst_stride;
1693       h -= 2;
1694     } while (h != 0);
1695   } else {
1696     do {
1697       const uint8_t *src_ptr = src;
1698       uint8_t *dst_ptr = dst;
1699       int width = w;
1700 
1701       do {
1702         uint8x16_t s0 = vld1q_u8(src_ptr);
1703         uint8x16_t s1 = vld1q_u8(src_ptr + 1);
1704 
1705         uint8x16_t d0 = vrhaddq_u8(s0, s1);
1706 
1707         vst1q_u8(dst_ptr, d0);
1708 
1709         src_ptr += 16;
1710         dst_ptr += 16;
1711         width -= 16;
1712       } while (width != 0);
1713       src += src_stride;
1714       dst += dst_stride;
1715     } while (--h != 0);
1716   }
1717 }
1718 
1719 void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
1720                                     uint8_t *dst, int dst_stride, int w, int h,
1721                                     const InterpFilterParams *filter_params_y,
1722                                     const int subpel_y_qn) {
1723   assert(subpel_y_qn == 8);
1724   assert(filter_params_y->taps == 2);
1725   (void)filter_params_y;
1726   (void)subpel_y_qn;
1727 
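  // With subpel_y_qn == 8 the 2-tap filter is { 64, 64 }: average each pixel
  // with the one directly below it using a rounding halving add.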
1728   if (w <= 4) {
1729     do {
1730       uint8x8_t s0 = load_unaligned_u8_4x1(src);
1731       uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride);
1732       uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride);
1733 
1734       uint8x8_t d0 = vrhadd_u8(s0, s1);
1735       uint8x8_t d1 = vrhadd_u8(s1, s2);
1736 
1737       if (w == 2) {
1738         store_u8_2x1(dst + 0 * dst_stride, d0);
1739         store_u8_2x1(dst + 1 * dst_stride, d1);
1740       } else {
1741         store_u8_4x1(dst + 0 * dst_stride, d0);
1742         store_u8_4x1(dst + 1 * dst_stride, d1);
1743       }
1744 
1745       src += 2 * src_stride;
1746       dst += 2 * dst_stride;
1747       h -= 2;
1748     } while (h != 0);
1749   } else if (w == 8) {
1750     do {
1751       uint8x8_t s0 = vld1_u8(src);
1752       uint8x8_t s1 = vld1_u8(src + src_stride);
1753       uint8x8_t s2 = vld1_u8(src + 2 * src_stride);
1754 
1755       uint8x8_t d0 = vrhadd_u8(s0, s1);
1756       uint8x8_t d1 = vrhadd_u8(s1, s2);
1757 
1758       vst1_u8(dst, d0);
1759       vst1_u8(dst + dst_stride, d1);
1760 
1761       src += 2 * src_stride;
1762       dst += 2 * dst_stride;
1763       h -= 2;
1764     } while (h != 0);
1765   } else {
1766     do {
1767       const uint8_t *src_ptr = src;
1768       uint8_t *dst_ptr = dst;
1769       int height = h;
1770 
1771       do {
1772         uint8x16_t s0 = vld1q_u8(src_ptr);
1773         uint8x16_t s1 = vld1q_u8(src_ptr + src_stride);
1774 
1775         uint8x16_t d0 = vrhaddq_u8(s0, s1);
1776 
1777         vst1q_u8(dst_ptr, d0);
1778 
1779         src_ptr += src_stride;
1780         dst_ptr += dst_stride;
1781       } while (--height != 0);
1782       src += 16;
1783       dst += 16;
1784       w -= 16;
1785     } while (w != 0);
1786   }
1787 }
1788 
1789 void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
1790                                      uint8_t *dst, int dst_stride, int w, int h,
1791                                      const InterpFilterParams *filter_params_x,
1792                                      const InterpFilterParams *filter_params_y,
1793                                      const int subpel_x_qn,
1794                                      const int subpel_y_qn,
1795                                      ConvolveParams *conv_params) {
1796   assert(subpel_x_qn == 8);
1797   assert(subpel_y_qn == 8);
1798   assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
1799   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
1800   (void)filter_params_x;
1801   (void)subpel_x_qn;
1802   (void)filter_params_y;
1803   (void)subpel_y_qn;
1804   (void)conv_params;
1805 
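  // Half-pel in both directions: the horizontal pass stores a0 + a1 for every
  // pixel, the vertical pass adds two such row sums and performs a rounding
  // shift by 2, giving the rounded average of each 2x2 neighbourhood.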
1806   uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
1807   int im_h = h + 1;
1808   int im_stride = w;
1809   assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
1810 
1811   uint16_t *im = im_block;
1812 
1813   // Horizontal filter.
1814   if (w <= 4) {
1815     do {
1816       uint8x8_t s0 = vld1_u8(src);
1817       uint8x8_t s1 = vld1_u8(src + 1);
1818 
1819       uint16x4_t sum = vget_low_u16(vaddl_u8(s0, s1));
1820 
1821       // Safe to store the whole vector: the im buffer is big enough.
1822       vst1_u16(im, sum);
1823 
1824       src += src_stride;
1825       im += im_stride;
1826     } while (--im_h != 0);
1827   } else {
1828     do {
1829       const uint8_t *src_ptr = src;
1830       uint16_t *im_ptr = im;
1831       int width = w;
1832 
1833       do {
1834         uint8x8_t s0 = vld1_u8(src_ptr);
1835         uint8x8_t s1 = vld1_u8(src_ptr + 1);
1836 
1837         uint16x8_t sum = vaddl_u8(s0, s1);
1838 
1839         vst1q_u16(im_ptr, sum);
1840 
1841         src_ptr += 8;
1842         im_ptr += 8;
1843         width -= 8;
1844       } while (width != 0);
1845       src += src_stride;
1846       im += im_stride;
1847     } while (--im_h != 0);
1848   }
1849 
1850   im = im_block;
1851 
1852   // Vertical filter.
1853   if (w <= 4) {
1854     do {
1855       uint16x4_t s0 = vld1_u16(im);
1856       uint16x4_t s1 = vld1_u16(im + im_stride);
1857       uint16x4_t s2 = vld1_u16(im + 2 * im_stride);
1858 
1859       uint16x4_t sum0 = vadd_u16(s0, s1);
1860       uint16x4_t sum1 = vadd_u16(s1, s2);
1861 
1862       uint8x8_t d0 = vqrshrn_n_u16(vcombine_u16(sum0, vdup_n_u16(0)), 2);
1863       uint8x8_t d1 = vqrshrn_n_u16(vcombine_u16(sum1, vdup_n_u16(0)), 2);
1864 
1865       if (w == 2) {
1866         store_u8_2x1(dst + 0 * dst_stride, d0);
1867         store_u8_2x1(dst + 1 * dst_stride, d1);
1868       } else {
1869         store_u8_4x1(dst + 0 * dst_stride, d0);
1870         store_u8_4x1(dst + 1 * dst_stride, d1);
1871       }
1872 
1873       im += 2 * im_stride;
1874       dst += 2 * dst_stride;
1875       h -= 2;
1876     } while (h != 0);
1877   } else {
1878     do {
1879       uint16_t *im_ptr = im;
1880       uint8_t *dst_ptr = dst;
1881       int height = h;
1882 
1883       do {
1884         uint16x8_t s0 = vld1q_u16(im_ptr);
1885         uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);
1886 
1887         uint16x8_t sum = vaddq_u16(s0, s1);
1888         uint8x8_t d0 = vqrshrn_n_u16(sum, 2);
1889 
1890         vst1_u8(dst_ptr, d0);
1891 
1892         im_ptr += im_stride;
1893         dst_ptr += dst_stride;
1894       } while (--height != 0);
1895       im += 8;
1896       dst += 8;
1897       w -= 8;
1898     } while (w != 0);
1899   }
1900 }
1901