/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/arm/compound_convolve_neon.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"

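// First-pass (horizontal) 4-tap filtering of four pixels for the compound 2D
// convolution. horiz_const carries the bit-depth offset and the rounding shim
// added by the caller.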
static inline int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t x_filter,
                                         const int16x4_t horiz_const) {
  int16x4_t sum = horiz_const;
  sum = vmla_lane_s16(sum, s0, x_filter, 0);
  sum = vmla_lane_s16(sum, s1, x_filter, 1);
  sum = vmla_lane_s16(sum, s2, x_filter, 2);
  sum = vmla_lane_s16(sum, s3, x_filter, 3);

  // We halved the convolution filter values so -1 from the right shift.
  return vshr_n_s16(sum, ROUND0_BITS - 1);
}

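// First-pass (horizontal) 8-tap filtering of eight pixels for the compound 2D
// convolution.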
static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t s6, const int16x8_t s7,
                                         const int16x8_t x_filter,
                                         const int16x8_t horiz_const) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int16x8_t sum = horiz_const;
  sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0);
  sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
  sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
  sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
  sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
  sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
  sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
  sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);

  // We halved the convolution filter values so -1 from the right shift.
  return vshrq_n_s16(sum, ROUND0_BITS - 1);
}

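// Horizontal pass of the compound 2D convolution: filter im_h rows of the
// source block and store the widened intermediate results in im_block.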
static inline void dist_wtd_convolve_2d_horiz_neon(
    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
    const int16_t *x_filter_ptr, const int im_h, int w) {
  const int bd = 8;

  const uint8_t *src_ptr = src;
  int16_t *dst_ptr = im_block;
  int dst_stride = im_stride;
  int height = im_h;

  if (w == 4) {
    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
    // shifts - which are generally faster than rounding shifts on modern CPUs.
    // (The extra -1 is needed because we halved the filter values.)
    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
                                             (1 << ((ROUND0_BITS - 1) - 1)));
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);

    src_ptr += 2;

    do {
      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

      __builtin_prefetch(dst_ptr);

      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6

      int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);

      vst1_s16(dst_ptr, d0);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  } else {
    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
    // shifts - which are generally faster than rounding shifts on modern CPUs.
    // (The extra -1 is needed because we halved the filter values.)
    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
                                              (1 << ((ROUND0_BITS - 1) - 1)));
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
    do {
      const uint8_t *s;
      int16_t *d = dst_ptr;
      int width = w;

      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);
      __builtin_prefetch(src_ptr + 4 * src_stride);
      __builtin_prefetch(src_ptr + 5 * src_stride);
      __builtin_prefetch(src_ptr + 6 * src_stride);
      __builtin_prefetch(src_ptr + 7 * src_stride);

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      s = src_ptr + 7;

      __builtin_prefetch(dst_ptr + 0 * dst_stride);
      __builtin_prefetch(dst_ptr + 1 * dst_stride);
      __builtin_prefetch(dst_ptr + 2 * dst_stride);
      __builtin_prefetch(dst_ptr + 3 * dst_stride);
      __builtin_prefetch(dst_ptr + 4 * dst_stride);
      __builtin_prefetch(dst_ptr + 5 * dst_stride);
      __builtin_prefetch(dst_ptr + 6 * dst_stride);
      __builtin_prefetch(dst_ptr + 7 * dst_stride);

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
                                        x_filter, horiz_const);
        int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
                                        x_filter, horiz_const);
        int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
                                        x_filter, horiz_const);
        int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
                                        x_filter, horiz_const);
        int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
                                        x_filter, horiz_const);
        int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
                                        x_filter, horiz_const);
        int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
                                        x_filter, horiz_const);
        int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
                                        x_filter, horiz_const);

        transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
        store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * dst_stride;
      height -= 8;
    } while (height > 8);
#endif  // AOM_ARCH_AARCH64

    do {
      const uint8_t *s;
      int16_t *d = dst_ptr;
      int width = w;

      uint8x8_t t0 = vld1_u8(src_ptr);
      int16x8_t s0 =
          vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7

      s = src_ptr + 8;
      __builtin_prefetch(dst_ptr);

      do {
        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));

        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
                                        x_filter, horiz_const);
        vst1q_s16(d, d0);

        s0 = s8;
        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

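// Distance-weighted compound 2D convolution: a horizontal pass into im_block,
// followed by a 6-tap or 8-tap vertical pass that either stores to the
// compound buffer or averages with it, as selected by conv_params.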
void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
                                   uint8_t *dst8, int dst8_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;

  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
                                  x_filter_ptr, im_h, w);

  if (clamped_y_taps == 6) {
    if (conv_params->do_average) {
      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
        dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
            w);
      } else {
        dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
                                                dst8_stride, conv_params,
                                                y_filter, h, w);
      }
    } else {
      dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
                                          y_filter, h, w);
    }
  } else {
    if (conv_params->do_average) {
      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
        dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
            w);
      } else {
        dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
                                                dst8_stride, conv_params,
                                                y_filter, h, w);
      }
    } else {
      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
                                          y_filter, h, w);
    }
  }
}

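// Compound copy with distance-weighted averaging: scale and offset the source
// pixels, combine them with the values already in the compound buffer using
// the fwd_offset/bck_offset weights, and write the 8-bit result to dst8.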
static inline void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));

  const uint16_t fwd_offset = conv_params->fwd_offset;
  const uint16_t bck_offset = conv_params->bck_offset;

  CONV_BUF_TYPE *dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    do {
      uint8x8_t s0, s1, s2, s3;
      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
      uint16x4_t d1 =
          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
      uint16x4_t d2 =
          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
      uint16x4_t d3 =
          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01, d23;
      compute_dist_wtd_avg_4x4(
          dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset,
          vreinterpretq_s16_u16(round_offset_vec), &d01, &d23);

      store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01);
      store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      dst8 += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  } else {
    do {
      const uint8_t *s = src;
      CONV_BUF_TYPE *d = dst;
      uint8_t *d_u8 = dst8;
      int width = w;

      do {
        uint8x8_t s0, s1, s2, s3;
        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset,
                                 vreinterpretq_s16_u16(round_offset_vec),
                                 &d0_u8, &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      dst8 += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  }
}

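// Compound copy with basic (equal-weight) averaging against the compound
// buffer, writing the 8-bit result to dst8.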
static inline void dist_wtd_convolve_2d_copy_avg_neon(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));

  CONV_BUF_TYPE *dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    do {
      uint8x8_t s0, s1, s2, s3;
      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
      uint16x4_t d1 =
          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
      uint16x4_t d2 =
          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
      uint16x4_t d3 =
          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01, d23;
      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                            vreinterpretq_s16_u16(round_offset_vec), &d01,
                            &d23);

      store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01);
      store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      dst8 += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  } else {
    do {
      const uint8_t *s = src;
      CONV_BUF_TYPE *d = dst;
      uint8_t *d_u8 = dst8;
      int width = w;

      do {
        uint8x8_t s0, s1, s2, s3;
        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              vreinterpretq_s16_u16(round_offset_vec), &d0_u8,
                              &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      dst8 += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  }
}

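// Compound copy without averaging: scale and offset the source pixels and
// store the 16-bit results in the compound buffer.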
static inline void dist_wtd_convolve_2d_copy_neon(const uint8_t *src,
                                                  int src_stride, int w, int h,
                                                  ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));

  CONV_BUF_TYPE *dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    do {
      uint8x8_t s0, s1, s2, s3;
      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
      uint16x4_t d1 =
          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
      uint16x4_t d2 =
          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
      uint16x4_t d3 =
          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    do {
      const uint8_t *s = src;
      CONV_BUF_TYPE *d = dst;
      int width = w;

      do {
        uint8x8_t s0, s1, s2, s3;
        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}

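// Dispatch the compound copy on the averaging mode requested in conv_params.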
void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
                                        uint8_t *dst8, int dst8_stride, int w,
                                        int h, ConvolveParams *conv_params) {
  if (conv_params->do_average) {
    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
      dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
          src, src_stride, dst8, dst8_stride, w, h, conv_params);
    } else {
      dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w,
                                         h, conv_params);
    }
  } else {
    dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params);
  }
}

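// Horizontal-only 4-tap compound filtering of four pixels; the rounding shift
// and compound round offset are applied via a rounding shift-and-accumulate.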
static inline uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t x_filter,
                                       const int16x4_t round_offset) {
  int16x4_t sum = vmul_lane_s16(s0, x_filter, 0);
  sum = vmla_lane_s16(sum, s1, x_filter, 1);
  sum = vmla_lane_s16(sum, s2, x_filter, 2);
  sum = vmla_lane_s16(sum, s3, x_filter, 3);

  // We halved the convolution filter values so -1 from the right shift.
  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
  return vreinterpret_u16_s16(res);
}

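// Horizontal-only 8-tap compound filtering of eight pixels.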
static inline uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
                                       const int16x8_t s2, const int16x8_t s3,
                                       const int16x8_t s4, const int16x8_t s5,
                                       const int16x8_t s6, const int16x8_t s7,
                                       const int16x8_t x_filter,
                                       const int16x8_t round_offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0);
  sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
  sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
  sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
  sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
  sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
  sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
  sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);

  // We halved the convolution filter values so -1 from the right shift.
  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
  return vreinterpretq_u16_s16(res);
}

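// Horizontal-only compound convolution with distance-weighted averaging
// against the compound buffer.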
static inline void dist_wtd_convolve_x_dist_wtd_avg_neon(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  const uint16_t fwd_offset = conv_params->fwd_offset;
  const uint16_t bck_offset = conv_params->bck_offset;

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  uint8_t *dst8_ptr = dst8;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);

    src_ptr += 2;

    do {
      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

      __builtin_prefetch(dst_ptr);
      __builtin_prefetch(dst8_ptr);

      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6

      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
                                    vget_low_s16(round_offset_vec));

      uint16x4_t dd0 = vld1_u16(dst_ptr);

      uint8x8_t d01;
      compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
                               vget_low_s16(round_offset_vec), &d01);

      store_u8_4x1(dst8_ptr, d01);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
      dst8_ptr += dst8_stride;
    } while (--height != 0);
  } else {
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
    while (height >= 8) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      __builtin_prefetch(d + 0 * dst_stride);
      __builtin_prefetch(d + 1 * dst_stride);
      __builtin_prefetch(d + 2 * dst_stride);
      __builtin_prefetch(d + 3 * dst_stride);
      __builtin_prefetch(d + 4 * dst_stride);
      __builtin_prefetch(d + 5 * dst_stride);
      __builtin_prefetch(d + 6 * dst_stride);
      __builtin_prefetch(d + 7 * dst_stride);

      s += 7;

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);
        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                      round_offset_vec);
        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                      round_offset_vec);
        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                      round_offset_vec);
        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
                                      x_filter, round_offset_vec);
        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
                                      x_filter, round_offset_vec);
        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
                                      x_filter, round_offset_vec);
        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
                                      x_filter, round_offset_vec);

        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
                                 &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        uint16x8_t dd4, dd5, dd6, dd7;
        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);

        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
                                 &d6_u8, &d7_u8);

        store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
                     d7_u8);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * dst_stride;
      dst8_ptr += 8 * dst8_stride;
      height -= 8;
    }
#endif  // AOM_ARCH_AARCH64

    while (height > 0) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

      __builtin_prefetch(d);

      s += 8;

      do {
        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));

        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
                                 round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);

        s0 = s8;
        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      dst8_ptr += dst8_stride;
      height--;
    }
  }
}

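// Horizontal-only compound convolution with basic averaging against the
// compound buffer.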
static inline void dist_wtd_convolve_x_avg_neon(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  uint8_t *dst8_ptr = dst8;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);

    src_ptr += 2;

    do {
      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

      __builtin_prefetch(dst_ptr);
      __builtin_prefetch(dst8_ptr);

      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6

      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
                                    vget_low_s16(round_offset_vec));

      uint16x4_t dd0 = vld1_u16(dst_ptr);

      uint8x8_t d01;
      compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);

      store_u8_4x1(dst8_ptr, d01);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
      dst8_ptr += dst8_stride;
    } while (--height != 0);
  } else {
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
    while (height >= 8) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      __builtin_prefetch(d + 0 * dst_stride);
      __builtin_prefetch(d + 1 * dst_stride);
      __builtin_prefetch(d + 2 * dst_stride);
      __builtin_prefetch(d + 3 * dst_stride);
      __builtin_prefetch(d + 4 * dst_stride);
      __builtin_prefetch(d + 5 * dst_stride);
      __builtin_prefetch(d + 6 * dst_stride);
      __builtin_prefetch(d + 7 * dst_stride);

      s += 7;

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);
        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                      round_offset_vec);
        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                      round_offset_vec);
        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                      round_offset_vec);
        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
                                      x_filter, round_offset_vec);
        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
                                      x_filter, round_offset_vec);
        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
                                      x_filter, round_offset_vec);
        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
                                      x_filter, round_offset_vec);

        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        uint16x8_t dd4, dd5, dd6, dd7;
        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);

        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);

        store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
                     d7_u8);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * dst_stride;
      dst8_ptr += 8 * dst8_stride;
      height -= 8;
    }
#endif  // AOM_ARCH_AARCH64

    while (height > 0) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

      __builtin_prefetch(d);

      s += 8;

      do {
        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));

        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);

        s0 = s8;
        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      dst8_ptr += dst8_stride;
      height--;
    }
  }
}

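// Horizontal-only compound convolution without averaging: store the 16-bit
// results in the compound buffer.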
static inline void dist_wtd_convolve_x_neon(
    const uint8_t *src, int src_stride, int w, int h,
    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);

    src_ptr += 2;

    do {
      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

      __builtin_prefetch(dst_ptr);

      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6

      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
                                    vget_low_s16(round_offset_vec));

      vst1_u16(dst_ptr, d0);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  } else {
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
    while (height >= 8) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int width = w;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      __builtin_prefetch(d + 0 * dst_stride);
      __builtin_prefetch(d + 1 * dst_stride);
      __builtin_prefetch(d + 2 * dst_stride);
      __builtin_prefetch(d + 3 * dst_stride);
      __builtin_prefetch(d + 4 * dst_stride);
      __builtin_prefetch(d + 5 * dst_stride);
      __builtin_prefetch(d + 6 * dst_stride);
      __builtin_prefetch(d + 7 * dst_stride);

      s += 7;

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);
        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                      round_offset_vec);
        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                      round_offset_vec);
        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                      round_offset_vec);
        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
                                      x_filter, round_offset_vec);
        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
                                      x_filter, round_offset_vec);
        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
                                      x_filter, round_offset_vec);
        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
                                      x_filter, round_offset_vec);

        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * dst_stride;
      height -= 8;
    }
#endif  // AOM_ARCH_AARCH64

    while (height > 0) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int width = w;

      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

      __builtin_prefetch(d);

      s = src_ptr + 8;

      do {
        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));

        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);

        vst1q_u16(d, d0);

        s0 = s8;
        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      height--;
    }
  }
}

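// Dispatch the horizontal-only compound convolution on the averaging mode
// requested in conv_params.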
void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
                                  uint8_t *dst8, int dst8_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const int subpel_x_qn,
                                  ConvolveParams *conv_params) {
  if (conv_params->do_average) {
    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
      dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride,
                                            w, h, filter_params_x, subpel_x_qn,
                                            conv_params);
    } else {
      dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params);
    }
  } else {
    dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x,
                             subpel_x_qn, conv_params);
  }
}

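// Vertical 6-tap compound filtering of four pixels. The full 8-tap filter is
// passed in; taps 0 and 7 are zero for 6-tap filters and are skipped.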
static inline uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x8_t y_filter,
                                       const int16x4_t round_offset) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  // Filter values at indices 0 and 7 are 0.
  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
  sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);

  // We halved the convolution filter values so -1 from the right shift.
  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
  return vreinterpret_u16_s16(res);
}

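// Vertical 6-tap compound filtering of eight pixels.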
convolve6_8_y(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t y_filter,const int16x8_t round_offset)1233 static inline uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
1234                                        const int16x8_t s2, const int16x8_t s3,
1235                                        const int16x8_t s4, const int16x8_t s5,
1236                                        const int16x8_t y_filter,
1237                                        const int16x8_t round_offset) {
1238   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1239   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1240 
1241   // Filter values at indices 0 and 7 are 0.
1242   int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1);
1243   sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2);
1244   sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3);
1245   sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0);
1246   sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1);
1247   sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2);
1248 
1249   // We halved the convolution filter values so -1 from the right shift.
1250   int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
1251   return vreinterpretq_u16_s16(res);
1252 }
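// The 6-tap helpers above rely on filters whose taps 0 and 7 are zero, so only
// y_filter[1..6] contribute and two multiply-accumulates per output are saved.
// A rough scalar equivalent of one output lane (illustrative only; it assumes
// the filter values were halved up front, which is why the shift is
// ROUND0_BITS - 1):
//
//   int32_t sum = 0;
//   for (int k = 1; k <= 6; ++k) sum += y_filter[k] * src[k - 1];
//   int16_t out = round_offset + (int16_t)ROUND_POWER_OF_TWO(sum, ROUND0_BITS - 1);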
1253 
1254 static inline void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
1255     const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
1256     const int dst8_stride, int w, int h, const int16x8_t y_filter,
1257     ConvolveParams *conv_params) {
1258   const int bd = 8;
1259   const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
1260   const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
1261                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
1262   const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
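  // Worked example of the compound offset above, assuming the standard 8-bit
  // configuration (FILTER_BITS == 7, ROUND0_BITS == 3, COMPOUND_ROUND1_BITS == 7):
  // offset_bits = 8 + 14 - 3 = 19, so
  // round_offset = (1 << 12) + (1 << 11) = 4096 + 2048 = 6144.
  // Adding this keeps the intermediate compound values non-negative so they fit
  // in the unsigned 16-bit CONV_BUF_TYPE buffer; the averaging helpers subtract
  // it again before producing 8-bit output.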
1263 
1264   const uint16_t fwd_offset = conv_params->fwd_offset;
1265   const uint16_t bck_offset = conv_params->bck_offset;
1266 
1267   CONV_BUF_TYPE *dst_ptr = conv_params->dst;
1268   const int dst_stride = conv_params->dst_stride;
1269   int width = w;
1270 
1271   if (w == 4 || h == 4) {
1272     do {
1273       const uint8_t *s = src_ptr;
1274       CONV_BUF_TYPE *d = dst_ptr;
1275       uint8_t *d_u8 = dst8_ptr;
1276       int height = h;
1277 
1278       uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1279       uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1280       uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1281       uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1282       uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
1283 
1284       int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1285       int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1286       int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1287       int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1288       int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
1289 
1290       s += 5 * src_stride;
1291 
1292       do {
1293 #if AOM_ARCH_AARCH64
1294         t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1295         t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1296         t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1297         t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1298 
1299         int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1300         int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1301         int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1302         int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1303 
1304         uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1305                                       vget_low_s16(round_offset_vec));
1306         uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
1307                                       vget_low_s16(round_offset_vec));
1308         uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
1309                                       vget_low_s16(round_offset_vec));
1310         uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
1311                                       vget_low_s16(round_offset_vec));
1312 
1313         uint16x4_t dd0, dd1, dd2, dd3;
1314         load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
1315 
1316         uint8x8_t d01, d23;
1317         compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
1318                                  bck_offset, round_offset_vec, &d01, &d23);
1319 
1320         store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
1321         store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
1322 
1323         s0 = s4;
1324         s1 = s5;
1325         s2 = s6;
1326         s3 = s7;
1327         s4 = s8;
1328         s += 4 * src_stride;
1329         d += 4 * dst_stride;
1330         d_u8 += 4 * dst8_stride;
1331         height -= 4;
1332 #else   // !AOM_ARCH_AARCH64
1333         t0 = load_unaligned_u8_4x1(s);
1334         int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1335 
1336         uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1337                                       vget_low_s16(round_offset_vec));
1338 
1339         uint16x4_t dd0 = vld1_u16(d);
1340 
1341         uint8x8_t d01;
1342         compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
1343                                  vget_low_s16(round_offset_vec), &d01);
1344 
1345         store_u8_4x1(d_u8, d01);
1346 
1347         s0 = s1;
1348         s1 = s2;
1349         s2 = s3;
1350         s3 = s4;
1351         s4 = s5;
1352         s += src_stride;
1353         d += dst_stride;
1354         d_u8 += dst8_stride;
1355         height--;
1356 #endif  // AOM_ARCH_AARCH64
1357       } while (height != 0);
1358       src_ptr += 4;
1359       dst_ptr += 4;
1360       dst8_ptr += 4;
1361       width -= 4;
1362     } while (width != 0);
1363   } else {
1364     do {
1365       const uint8_t *s = src_ptr + (5 * src_stride);
1366       CONV_BUF_TYPE *d = dst_ptr;
1367       uint8_t *d_u8 = dst8_ptr;
1368       int height = h;
1369 
1370       uint8x8_t t0, t1, t2, t3, t4;
1371       load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
1372 
1373       int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1374       int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1375       int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1376       int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1377       int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1378 
1379       do {
1380 #if AOM_ARCH_AARCH64
1381         uint8x8_t t5, t6, t7;
1382         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1383 
1384         int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
1385         int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
1386         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
1387         int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
1388         int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
1389         int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
1390         int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
1391         int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
1392 
1393         uint16x8_t d0 =
1394             convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1395         uint16x8_t d1 =
1396             convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
1397         uint16x8_t d2 =
1398             convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
1399         uint16x8_t d3 =
1400             convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
1401         uint16x8_t d4 =
1402             convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
1403         uint16x8_t d5 =
1404             convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
1405         uint16x8_t d6 =
1406             convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
1407         uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
1408                                       round_offset_vec);
1409 
1410         uint16x8_t dd0, dd1, dd2, dd3;
1411         load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
1412 
1413         uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
1414         compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
1415                                  bck_offset, round_offset_vec, &d0_u8, &d1_u8,
1416                                  &d2_u8, &d3_u8);
1417 
1418         store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
1419         d_u8 += 4 * dst8_stride;
1420 
1421         uint16x8_t dd4, dd5, dd6, dd7;
1422         load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
1423 
1424         uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
1425         compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
1426                                  bck_offset, round_offset_vec, &d4_u8, &d5_u8,
1427                                  &d6_u8, &d7_u8);
1428 
1429         store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
1430         d_u8 += 4 * dst8_stride;
1431 
1432         s0 = s8;
1433         s1 = s9;
1434         s2 = s10;
1435         s3 = s11;
1436         s4 = s12;
1437         s += 8 * src_stride;
1438         d += 8 * dst_stride;
1439         height -= 8;
1440 #else   // !AOM_ARCH_AARCH64
1441         int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
1442 
1443         uint16x8_t d0 =
1444             convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1445 
1446         s0 = s1;
1447         s1 = s2;
1448         s2 = s3;
1449         s3 = s4;
1450         s4 = s5;
1451 
1452         uint16x8_t dd0 = vld1q_u16(d);
1453 
1454         uint8x8_t d0_u8;
1455         compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
1456                                  round_offset_vec, &d0_u8);
1457 
1458         vst1_u8(d_u8, d0_u8);
1459         d_u8 += dst8_stride;
1460 
1461         s += src_stride;
1462         d += dst_stride;
1463         height--;
1464 #endif  // AOM_ARCH_AARCH64
1465       } while (height != 0);
1466       src_ptr += 8;
1467       dst_ptr += 8;
1468       dst8_ptr += 8;
1469       width -= 8;
1470     } while (width != 0);
1471   }
1472 }
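// Loop-structure note: the AArch64 build unrolls 4 rows (for w == 4 or h == 4
// blocks) or 8 rows (for wider blocks) per inner iteration, trading the larger
// AArch64 register file for fewer loads per output row, while the 32-bit Arm
// build advances one row at a time to limit register pressure. The _avg and
// no-average variants below follow the same pattern.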
1473 
1474 static inline void dist_wtd_convolve_y_6tap_avg_neon(
1475     const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
1476     const int dst8_stride, int w, int h, const int16x8_t y_filter,
1477     ConvolveParams *conv_params) {
1478   const int bd = 8;
1479   const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
1480   const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
1481                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
1482   const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
1483 
1484   CONV_BUF_TYPE *dst_ptr = conv_params->dst;
1485   const int dst_stride = conv_params->dst_stride;
1486   int width = w;
1487 
1488   if (w == 4 || h == 4) {
1489     do {
1490       const uint8_t *s = src_ptr;
1491       CONV_BUF_TYPE *d = dst_ptr;
1492       uint8_t *d_u8 = dst8_ptr;
1493       int height = h;
1494 
1495       uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1496       uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1497       uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1498       uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1499       uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
1500 
1501       int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1502       int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1503       int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1504       int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1505       int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
1506 
1507       s += 5 * src_stride;
1508 
1509       do {
1510 #if AOM_ARCH_AARCH64
1511         t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1512         t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1513         t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1514         t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1515 
1516         int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1517         int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1518         int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1519         int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1520 
1521         uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1522                                       vget_low_s16(round_offset_vec));
1523         uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
1524                                       vget_low_s16(round_offset_vec));
1525         uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
1526                                       vget_low_s16(round_offset_vec));
1527         uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
1528                                       vget_low_s16(round_offset_vec));
1529 
1530         uint16x4_t dd0, dd1, dd2, dd3;
1531         load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
1532 
1533         uint8x8_t d01, d23;
1534         compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
1535                               round_offset_vec, &d01, &d23);
1536 
1537         store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
1538         store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
1539 
1540         s0 = s4;
1541         s1 = s5;
1542         s2 = s6;
1543         s3 = s7;
1544         s4 = s8;
1545         s += 4 * src_stride;
1546         d += 4 * dst_stride;
1547         d_u8 += 4 * dst8_stride;
1548         height -= 4;
1549 #else   // !AOM_ARCH_AARCH64
1550         t0 = load_unaligned_u8_4x1(s);
1551         int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1552 
1553         uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1554                                       vget_low_s16(round_offset_vec));
1555 
1556         uint16x4_t dd0 = vld1_u16(d);
1557 
1558         uint8x8_t d01;
1559         compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
1560 
1561         store_u8_4x1(d_u8, d01);
1562 
1563         s0 = s1;
1564         s1 = s2;
1565         s2 = s3;
1566         s3 = s4;
1567         s4 = s5;
1568         s += src_stride;
1569         d += dst_stride;
1570         d_u8 += dst8_stride;
1571         height--;
1572 #endif  // AOM_ARCH_AARCH64
1573       } while (height != 0);
1574       src_ptr += 4;
1575       dst_ptr += 4;
1576       dst8_ptr += 4;
1577       width -= 4;
1578     } while (width != 0);
1579   } else {
1580     do {
1581       const uint8_t *s = src_ptr + (5 * src_stride);
1582       CONV_BUF_TYPE *d = dst_ptr;
1583       uint8_t *d_u8 = dst8_ptr;
1584       int height = h;
1585 
1586       uint8x8_t t0, t1, t2, t3, t4;
1587       load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
1588 
1589       int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1590       int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1591       int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1592       int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1593       int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1594 
1595       do {
1596 #if AOM_ARCH_AARCH64
1597         uint8x8_t t5, t6, t7;
1598         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1599 
1600         int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
1601         int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
1602         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
1603         int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
1604         int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
1605         int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
1606         int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
1607         int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
1608 
1609         uint16x8_t d0 =
1610             convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1611         uint16x8_t d1 =
1612             convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
1613         uint16x8_t d2 =
1614             convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
1615         uint16x8_t d3 =
1616             convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
1617         uint16x8_t d4 =
1618             convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
1619         uint16x8_t d5 =
1620             convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
1621         uint16x8_t d6 =
1622             convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
1623         uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
1624                                       round_offset_vec);
1625 
1626         uint16x8_t dd0, dd1, dd2, dd3;
1627         load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
1628 
1629         uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
1630         compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
1631                               round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
1632 
1633         store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
1634         d_u8 += 4 * dst8_stride;
1635 
1636         uint16x8_t dd4, dd5, dd6, dd7;
1637         load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
1638 
1639         uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
1640         compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
1641                               round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
1642 
1643         store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
1644         d_u8 += 4 * dst8_stride;
1645 
1646         s0 = s8;
1647         s1 = s9;
1648         s2 = s10;
1649         s3 = s11;
1650         s4 = s12;
1651         s += 8 * src_stride;
1652         d += 8 * dst_stride;
1653         height -= 8;
1654 #else   // !AOM_ARCH_AARCH64
1655         int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
1656 
1657         uint16x8_t d0 =
1658             convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1659 
1660         s0 = s1;
1661         s1 = s2;
1662         s2 = s3;
1663         s3 = s4;
1664         s4 = s5;
1665 
1666         uint16x8_t dd0 = vld1q_u16(d);
1667 
1668         uint8x8_t d0_u8;
1669         compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
1670 
1671         vst1_u8(d_u8, d0_u8);
1672         d_u8 += dst8_stride;
1673 
1674         s += src_stride;
1675         d += dst_stride;
1676         height--;
1677 #endif  // AOM_ARCH_AARCH64
1678       } while (height != 0);
1679       src_ptr += 8;
1680       dst_ptr += 8;
1681       dst8_ptr += 8;
1682       width -= 8;
1683     } while (width != 0);
1684   }
1685 }
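// The basic-average variant above differs from the distance-weighted one only
// in the blend step. A rough scalar sketch of the final combine done by the
// compute_basic_avg_* helpers (illustrative only; round_bits here stands for
// 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS, i.e. 4 for 8-bit):
//
//   int32_t avg = (p0 + p1) >> 1;
//   uint8_t out = clip_pixel(ROUND_POWER_OF_TWO(avg - round_offset, round_bits));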
1686 
1687 static inline void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr,
1688                                                  int src_stride, int w, int h,
1689                                                  const int16x8_t y_filter,
1690                                                  ConvolveParams *conv_params) {
1691   const int bd = 8;
1692   const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
1693   const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
1694                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
1695   const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
1696 
1697   CONV_BUF_TYPE *dst_ptr = conv_params->dst;
1698   const int dst_stride = conv_params->dst_stride;
1699   int width = w;
1700 
1701   if (w == 4 || h == 4) {
1702     do {
1703       const uint8_t *s = src_ptr;
1704       CONV_BUF_TYPE *d = dst_ptr;
1705       int height = h;
1706 
1707       uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1708       uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1709       uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1710       uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1711       uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
1712 
1713       int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1714       int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1715       int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1716       int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1717       int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
1718 
1719       s += 5 * src_stride;
1720 
1721       do {
1722 #if AOM_ARCH_AARCH64
1723         t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1724         t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1725         t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1726         t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1727 
1728         int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1729         int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1730         int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1731         int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1732 
1733         uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1734                                       vget_low_s16(round_offset_vec));
1735         uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
1736                                       vget_low_s16(round_offset_vec));
1737         uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
1738                                       vget_low_s16(round_offset_vec));
1739         uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
1740                                       vget_low_s16(round_offset_vec));
1741 
1742         store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
1743 
1744         s0 = s4;
1745         s1 = s5;
1746         s2 = s6;
1747         s3 = s7;
1748         s4 = s8;
1749         s += 4 * src_stride;
1750         d += 4 * dst_stride;
1751         height -= 4;
1752 #else   // !AOM_ARCH_AARCH64
1753         t0 = load_unaligned_u8_4x1(s);
1754         int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1755 
1756         uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1757                                       vget_low_s16(round_offset_vec));
1758 
1759         vst1_u16(d, d0);
1760 
1761         s0 = s1;
1762         s1 = s2;
1763         s2 = s3;
1764         s3 = s4;
1765         s4 = s5;
1766         s += src_stride;
1767         d += dst_stride;
1768         height--;
1769 #endif  // AOM_ARCH_AARCH64
1770       } while (height != 0);
1771       src_ptr += 4;
1772       dst_ptr += 4;
1773       width -= 4;
1774     } while (width != 0);
1775   } else {
1776     do {
1777       const uint8_t *s = src_ptr + (5 * src_stride);
1778       CONV_BUF_TYPE *d = dst_ptr;
1779       int height = h;
1780 
1781       uint8x8_t t0, t1, t2, t3, t4;
1782       load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
1783 
1784       int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1785       int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1786       int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1787       int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1788       int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1789 
1790       do {
1791 #if AOM_ARCH_AARCH64
1792         uint8x8_t t5, t6, t7;
1793         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1794 
1795         int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
1796         int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
1797         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
1798         int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
1799         int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
1800         int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
1801         int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
1802         int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
1803 
1804         uint16x8_t d0 =
1805             convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1806         uint16x8_t d1 =
1807             convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
1808         uint16x8_t d2 =
1809             convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
1810         uint16x8_t d3 =
1811             convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
1812         uint16x8_t d4 =
1813             convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
1814         uint16x8_t d5 =
1815             convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
1816         uint16x8_t d6 =
1817             convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
1818         uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
1819                                       round_offset_vec);
1820 
1821         store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
1822 
1823         s0 = s8;
1824         s1 = s9;
1825         s2 = s10;
1826         s3 = s11;
1827         s4 = s12;
1828         s += 8 * src_stride;
1829         d += 8 * dst_stride;
1830         height -= 8;
1831 #else   // !AOM_ARCH_AARCH64
1832         int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
1833 
1834         uint16x8_t d0 =
1835             convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1836 
1837         s0 = s1;
1838         s1 = s2;
1839         s2 = s3;
1840         s3 = s4;
1841         s4 = s5;
1842 
1843         vst1q_u16(d, d0);
1844 
1845         s += src_stride;
1846         d += dst_stride;
1847         height--;
1848 #endif  // AOM_ARCH_AARCH64
1849       } while (height != 0);
1850       src_ptr += 8;
1851       dst_ptr += 8;
1852       width -= 8;
1853     } while (width != 0);
1854   }
1855 }
1856 
1857 static inline uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
1858                                        const int16x4_t s2, const int16x4_t s3,
1859                                        const int16x4_t s4, const int16x4_t s5,
1860                                        const int16x4_t s6, const int16x4_t s7,
1861                                        const int16x8_t y_filter,
1862                                        const int16x4_t round_offset) {
1863   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1864   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1865 
1866   int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0);
1867   sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
1868   sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
1869   sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
1870   sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
1871   sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1);
1872   sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2);
1873   sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
1874 
1875   // We halved the convolution filter values so -1 from the right shift.
1876   int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
1877   return vreinterpret_u16_s16(res);
1878 }
1879 
1880 static inline uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
1881                                        const int16x8_t s2, const int16x8_t s3,
1882                                        const int16x8_t s4, const int16x8_t s5,
1883                                        const int16x8_t s6, const int16x8_t s7,
1884                                        const int16x8_t y_filter,
1885                                        const int16x8_t round_offset) {
1886   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1887   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1888 
1889   int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
1890   sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
1891   sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
1892   sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
1893   sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
1894   sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1);
1895   sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2);
1896   sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
1897 
1898   // We halved the convolution filter values so -1 from the right shift.
1899   int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
1900   return vreinterpretq_u16_s16(res);
1901 }
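// The 8-tap helpers use all eight filter taps. vrsra_n_s16 / vrsraq_n_s16 fuse
// the rounding right shift of the accumulator with the addition of the
// compound offset; per lane the result matches the scalar expression below
// (shown for the usual ROUND0_BITS == 3, i.e. a shift by 2 because the filter
// values were halved):
//
//   int16_t res = round_offset + (int16_t)((sum + 2) >> 2);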
1902 
1903 static inline void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(
1904     const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
1905     const int dst8_stride, int w, int h, const int16x8_t y_filter,
1906     ConvolveParams *conv_params) {
1907   const int bd = 8;
1908   const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
1909   const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
1910                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
1911   const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
1912 
1913   const uint16_t fwd_offset = conv_params->fwd_offset;
1914   const uint16_t bck_offset = conv_params->bck_offset;
1915 
1916   CONV_BUF_TYPE *dst_ptr = conv_params->dst;
1917   const int dst_stride = conv_params->dst_stride;
1918   int width = w;
1919 
1920   if (w == 4 || h == 4) {
1921     do {
1922       const uint8_t *s = src_ptr;
1923       CONV_BUF_TYPE *d = dst_ptr;
1924       uint8_t *d_u8 = dst8_ptr;
1925       int height = h;
1926 
1927       __builtin_prefetch(s + 0 * src_stride);
1928       __builtin_prefetch(s + 1 * src_stride);
1929       __builtin_prefetch(s + 2 * src_stride);
1930       __builtin_prefetch(s + 3 * src_stride);
1931 
1932       uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1933       uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1934       uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1935       uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1936       uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
1937       uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
1938       uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
1939 
1940       int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1941       int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1942       int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1943       int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1944       int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
1945       int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
1946       int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
1947 
1948       __builtin_prefetch(d + 0 * dst_stride);
1949       __builtin_prefetch(d + 1 * dst_stride);
1950       __builtin_prefetch(d + 2 * dst_stride);
1951       __builtin_prefetch(d + 3 * dst_stride);
1952 
1953       s += 7 * src_stride;
1954 
1955       do {
1956 #if AOM_ARCH_AARCH64
1957         t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1958         t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1959         t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1960         t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1961 
1962         int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1963         int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1964         int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1965         int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1966 
1967         uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
1968                                       vget_low_s16(round_offset_vec));
1969         uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
1970                                       vget_low_s16(round_offset_vec));
1971         uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
1972                                       vget_low_s16(round_offset_vec));
1973         uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
1974                                       vget_low_s16(round_offset_vec));
1975 
1976         __builtin_prefetch(d + 0 * dst_stride);
1977         __builtin_prefetch(d + 1 * dst_stride);
1978         __builtin_prefetch(d + 2 * dst_stride);
1979         __builtin_prefetch(d + 3 * dst_stride);
1980 
1981         __builtin_prefetch(d_u8 + 0 * dst8_stride);
1982         __builtin_prefetch(d_u8 + 1 * dst8_stride);
1983         __builtin_prefetch(d_u8 + 2 * dst8_stride);
1984         __builtin_prefetch(d_u8 + 3 * dst8_stride);
1985 
1986         uint16x4_t dd0, dd1, dd2, dd3;
1987         load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
1988 
1989         uint8x8_t d01, d23;
1990         compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
1991                                  bck_offset, round_offset_vec, &d01, &d23);
1992 
1993         store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
1994         store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
1995 
1996         s0 = s4;
1997         s1 = s5;
1998         s2 = s6;
1999         s3 = s7;
2000         s4 = s8;
2001         s5 = s9;
2002         s6 = s10;
2003         s += 4 * src_stride;
2004         d += 4 * dst_stride;
2005         d_u8 += 4 * dst8_stride;
2006         height -= 4;
2007 #else   // !AOM_ARCH_AARCH64
2008         t0 = load_unaligned_u8_4x1(s);
2009         int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2010 
2011         uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2012                                       vget_low_s16(round_offset_vec));
2013 
2014         __builtin_prefetch(d);
2015 
2016         uint16x4_t dd0 = vld1_u16(d);
2017 
2018         uint8x8_t d01;
2019         compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
2020                                  vget_low_s16(round_offset_vec), &d01);
2021 
2022         store_u8_4x1(d_u8, d01);
2023 
2024         s0 = s1;
2025         s1 = s2;
2026         s2 = s3;
2027         s3 = s4;
2028         s4 = s5;
2029         s5 = s6;
2030         s6 = s7;
2031         s += src_stride;
2032         d += dst_stride;
2033         d_u8 += dst8_stride;
2034         height--;
2035 #endif  // AOM_ARCH_AARCH64
2036       } while (height != 0);
2037       src_ptr += 4;
2038       dst_ptr += 4;
2039       dst8_ptr += 4;
2040       width -= 4;
2041     } while (width != 0);
2042   } else {
2043     do {
2044       const uint8_t *s = src_ptr;
2045       CONV_BUF_TYPE *d = dst_ptr;
2046       uint8_t *d_u8 = dst8_ptr;
2047       int height = h;
2048 
2049       __builtin_prefetch(s + 0 * src_stride);
2050       __builtin_prefetch(s + 1 * src_stride);
2051       __builtin_prefetch(s + 2 * src_stride);
2052       __builtin_prefetch(s + 3 * src_stride);
2053       __builtin_prefetch(s + 4 * src_stride);
2054       __builtin_prefetch(s + 5 * src_stride);
2055       __builtin_prefetch(s + 6 * src_stride);
2056       __builtin_prefetch(s + 7 * src_stride);
2057 
2058       uint8x8_t t0, t1, t2, t3, t4, t5, t6;
2059       load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
2060 
2061       int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
2062       int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
2063       int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
2064       int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
2065       int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
2066       int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
2067       int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
2068 
2069       s += 7 * src_stride;
2070 
2071       do {
2072 #if AOM_ARCH_AARCH64
2073         uint8x8_t t7;
2074         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
2075 
2076         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
2077         int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
2078         int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
2079         int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
2080         int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
2081         int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
2082         int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
2083         int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
2084 
2085         __builtin_prefetch(dst_ptr + 0 * dst_stride);
2086         __builtin_prefetch(dst_ptr + 1 * dst_stride);
2087         __builtin_prefetch(dst_ptr + 2 * dst_stride);
2088         __builtin_prefetch(dst_ptr + 3 * dst_stride);
2089 
2090         uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2091                                       round_offset_vec);
2092         uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
2093                                       round_offset_vec);
2094         uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
2095                                       round_offset_vec);
2096         uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
2097                                       round_offset_vec);
2098         uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
2099                                       y_filter, round_offset_vec);
2100         uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
2101                                       y_filter, round_offset_vec);
2102         uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
2103                                       y_filter, round_offset_vec);
2104         uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
2105                                       y_filter, round_offset_vec);
2106 
2107         __builtin_prefetch(d + 0 * dst8_stride);
2108         __builtin_prefetch(d + 1 * dst8_stride);
2109         __builtin_prefetch(d + 2 * dst8_stride);
2110         __builtin_prefetch(d + 3 * dst8_stride);
2111 
2112         uint16x8_t dd0, dd1, dd2, dd3;
2113         load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
2114 
2115         uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
2116         compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
2117                                  bck_offset, round_offset_vec, &d0_u8, &d1_u8,
2118                                  &d2_u8, &d3_u8);
2119 
2120         store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
2121         d_u8 += 4 * dst8_stride;
2122 
2123         uint16x8_t dd4, dd5, dd6, dd7;
2124         load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
2125 
2126         uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
2127         compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
2128                                  bck_offset, round_offset_vec, &d4_u8, &d5_u8,
2129                                  &d6_u8, &d7_u8);
2130 
2131         store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
2132         d_u8 += 4 * dst8_stride;
2133 
2134         s0 = s8;
2135         s1 = s9;
2136         s2 = s10;
2137         s3 = s11;
2138         s4 = s12;
2139         s5 = s13;
2140         s6 = s14;
2141         s += 8 * src_stride;
2142         d += 8 * dst_stride;
2143         height -= 8;
2144 #else   // !AOM_ARCH_AARCH64
2145         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
2146 
2147         __builtin_prefetch(dst_ptr);
2148 
2149         uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2150                                       round_offset_vec);
2151 
2152         s0 = s1;
2153         s1 = s2;
2154         s2 = s3;
2155         s3 = s4;
2156         s4 = s5;
2157         s5 = s6;
2158         s6 = s7;
2159 
2160         __builtin_prefetch(d);
2161 
2162         uint16x8_t dd0 = vld1q_u16(d);
2163 
2164         uint8x8_t d0_u8;
2165         compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
2166                                  round_offset_vec, &d0_u8);
2167 
2168         vst1_u8(d_u8, d0_u8);
2169         d_u8 += dst8_stride;
2170 
2171         s += src_stride;
2172         d += dst_stride;
2173         height--;
2174 #endif  // AOM_ARCH_AARCH64
2175       } while (height != 0);
2176       src_ptr += 8;
2177       dst_ptr += 8;
2178       dst8_ptr += 8;
2179       width -= 8;
2180     } while (width != 0);
2181   }
2182 }
2183 
2184 static inline void dist_wtd_convolve_y_8tap_avg_neon(
2185     const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
2186     const int dst8_stride, int w, int h, const int16x8_t y_filter,
2187     ConvolveParams *conv_params) {
2188   const int bd = 8;
2189   const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
2190   const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
2191                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
2192   const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
2193 
2194   CONV_BUF_TYPE *dst_ptr = conv_params->dst;
2195   const int dst_stride = conv_params->dst_stride;
2196   int width = w;
2197 
2198   if (w == 4 || h == 4) {
2199     do {
2200       const uint8_t *s = src_ptr;
2201       CONV_BUF_TYPE *d = dst_ptr;
2202       uint8_t *d_u8 = dst8_ptr;
2203       int height = h;
2204 
2205       __builtin_prefetch(s + 0 * src_stride);
2206       __builtin_prefetch(s + 1 * src_stride);
2207       __builtin_prefetch(s + 2 * src_stride);
2208       __builtin_prefetch(s + 3 * src_stride);
2209 
2210       uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
2211       uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
2212       uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
2213       uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
2214       uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
2215       uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
2216       uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
2217 
2218       int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2219       int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
2220       int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
2221       int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
2222       int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
2223       int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
2224       int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
2225 
2226       __builtin_prefetch(d + 0 * dst_stride);
2227       __builtin_prefetch(d + 1 * dst_stride);
2228       __builtin_prefetch(d + 2 * dst_stride);
2229       __builtin_prefetch(d + 3 * dst_stride);
2230 
2231       s += 7 * src_stride;
2232 
2233       do {
2234 #if AOM_ARCH_AARCH64
2235         t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
2236         t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
2237         t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
2238         t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
2239 
2240         int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2241         int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
2242         int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
2243         int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
2244 
2245         uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2246                                       vget_low_s16(round_offset_vec));
2247         uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
2248                                       vget_low_s16(round_offset_vec));
2249         uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
2250                                       vget_low_s16(round_offset_vec));
2251         uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
2252                                       vget_low_s16(round_offset_vec));
2253 
2254         __builtin_prefetch(d + 0 * dst_stride);
2255         __builtin_prefetch(d + 1 * dst_stride);
2256         __builtin_prefetch(d + 2 * dst_stride);
2257         __builtin_prefetch(d + 3 * dst_stride);
2258 
2259         __builtin_prefetch(d_u8 + 0 * dst8_stride);
2260         __builtin_prefetch(d_u8 + 1 * dst8_stride);
2261         __builtin_prefetch(d_u8 + 2 * dst8_stride);
2262         __builtin_prefetch(d_u8 + 3 * dst8_stride);
2263 
2264         uint16x4_t dd0, dd1, dd2, dd3;
2265         load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
2266 
2267         uint8x8_t d01, d23;
2268         compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
2269                               round_offset_vec, &d01, &d23);
2270 
2271         store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
2272         store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
2273 
2274         s0 = s4;
2275         s1 = s5;
2276         s2 = s6;
2277         s3 = s7;
2278         s4 = s8;
2279         s5 = s9;
2280         s6 = s10;
2281         s += 4 * src_stride;
2282         d += 4 * dst_stride;
2283         d_u8 += 4 * dst8_stride;
2284         height -= 4;
2285 #else   // !AOM_ARCH_AARCH64
2286         t0 = load_unaligned_u8_4x1(s);
2287         int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2288 
2289         uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2290                                       vget_low_s16(round_offset_vec));
2291 
2292         __builtin_prefetch(d);
2293 
2294         uint16x4_t dd0 = vld1_u16(d);
2295 
2296         uint8x8_t d01;
2297         compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
2298 
2299         store_u8_4x1(d_u8, d01);
2300 
2301         s0 = s1;
2302         s1 = s2;
2303         s2 = s3;
2304         s3 = s4;
2305         s4 = s5;
2306         s5 = s6;
2307         s6 = s7;
2308         s += src_stride;
2309         d += dst_stride;
2310         d_u8 += dst8_stride;
2311         height--;
2312 #endif  // AOM_ARCH_AARCH64
2313       } while (height != 0);
2314       src_ptr += 4;
2315       dst_ptr += 4;
2316       dst8_ptr += 4;
2317       width -= 4;
2318     } while (width != 0);
2319   } else {
2320     do {
2321       const uint8_t *s = src_ptr;
2322       CONV_BUF_TYPE *d = dst_ptr;
2323       uint8_t *d_u8 = dst8_ptr;
2324       int height = h;
2325 
2326       __builtin_prefetch(s + 0 * src_stride);
2327       __builtin_prefetch(s + 1 * src_stride);
2328       __builtin_prefetch(s + 2 * src_stride);
2329       __builtin_prefetch(s + 3 * src_stride);
2330       __builtin_prefetch(s + 4 * src_stride);
2331       __builtin_prefetch(s + 5 * src_stride);
2332       __builtin_prefetch(s + 6 * src_stride);
2333       __builtin_prefetch(s + 7 * src_stride);
2334 
2335       uint8x8_t t0, t1, t2, t3, t4, t5, t6;
2336       load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
2337 
2338       int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
2339       int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
2340       int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
2341       int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
2342       int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
2343       int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
2344       int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
2345 
2346       s += 7 * src_stride;
2347 
2348       do {
2349 #if AOM_ARCH_AARCH64
2350         uint8x8_t t7;
2351         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
2352 
2353         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
2354         int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
2355         int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
2356         int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
2357         int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
2358         int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
2359         int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
2360         int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
2361 
2362         __builtin_prefetch(dst_ptr + 0 * dst_stride);
2363         __builtin_prefetch(dst_ptr + 1 * dst_stride);
2364         __builtin_prefetch(dst_ptr + 2 * dst_stride);
2365         __builtin_prefetch(dst_ptr + 3 * dst_stride);
2366 
2367         uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2368                                       round_offset_vec);
2369         uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
2370                                       round_offset_vec);
2371         uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
2372                                       round_offset_vec);
2373         uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
2374                                       round_offset_vec);
2375         uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
2376                                       y_filter, round_offset_vec);
2377         uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
2378                                       y_filter, round_offset_vec);
2379         uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
2380                                       y_filter, round_offset_vec);
2381         uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
2382                                       y_filter, round_offset_vec);
2383 
2384         __builtin_prefetch(d + 0 * dst8_stride);
2385         __builtin_prefetch(d + 1 * dst8_stride);
2386         __builtin_prefetch(d + 2 * dst8_stride);
2387         __builtin_prefetch(d + 3 * dst8_stride);
2388 
2389         uint16x8_t dd0, dd1, dd2, dd3;
2390         load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
2391 
2392         uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
2393         compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
2394                               round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
2395 
2396         store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
2397         d_u8 += 4 * dst8_stride;
2398 
2399         uint16x8_t dd4, dd5, dd6, dd7;
2400         load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
2401 
2402         uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
2403         compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
2404                               round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
2405 
2406         store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
2407         d_u8 += 4 * dst8_stride;
2408 
2409         s0 = s8;
2410         s1 = s9;
2411         s2 = s10;
2412         s3 = s11;
2413         s4 = s12;
2414         s5 = s13;
2415         s6 = s14;
2416         s += 8 * src_stride;
2417         d += 8 * dst_stride;
2418         height -= 8;
2419 #else   // !AOM_ARCH_AARCH64
2420         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
2421 
2422         __builtin_prefetch(dst_ptr);
2423 
2424         uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2425                                       round_offset_vec);
2426 
2427         s0 = s1;
2428         s1 = s2;
2429         s2 = s3;
2430         s3 = s4;
2431         s4 = s5;
2432         s5 = s6;
2433         s6 = s7;
2434 
2435         __builtin_prefetch(d);
2436 
2437         uint16x8_t dd0 = vld1q_u16(d);
2438 
2439         uint8x8_t d0_u8;
2440         compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
2441 
2442         vst1_u8(d_u8, d0_u8);
2443         d_u8 += dst8_stride;
2444 
2445         s += src_stride;
2446         d += dst_stride;
2447         height--;
2448 #endif  // AOM_ARCH_AARCH64
2449       } while (height != 0);
2450       src_ptr += 8;
2451       dst_ptr += 8;
2452       dst8_ptr += 8;
2453       width -= 8;
2454     } while (width != 0);
2455   }
2456 }
2457 
2458 static inline void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr,
2459                                                  int src_stride, int w, int h,
2460                                                  const int16x8_t y_filter,
2461                                                  ConvolveParams *conv_params) {
2462   const int bd = 8;
2463   const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
2464   const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
2465                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
2466   const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
2467 
2468   CONV_BUF_TYPE *dst_ptr = conv_params->dst;
2469   const int dst_stride = conv_params->dst_stride;
2470   int width = w;
2471 
2472   if (w == 4 || h == 4) {
2473     do {
2474       const uint8_t *s = src_ptr;
2475       CONV_BUF_TYPE *d = dst_ptr;
2476       int height = h;
2477 
2478       __builtin_prefetch(s + 0 * src_stride);
2479       __builtin_prefetch(s + 1 * src_stride);
2480       __builtin_prefetch(s + 2 * src_stride);
2481       __builtin_prefetch(s + 3 * src_stride);
2482 
2483       uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
2484       uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
2485       uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
2486       uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
2487       uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
2488       uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
2489       uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
2490 
2491       int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2492       int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
2493       int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
2494       int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
2495       int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
2496       int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
2497       int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
2498 
2499       __builtin_prefetch(d + 0 * dst_stride);
2500       __builtin_prefetch(d + 1 * dst_stride);
2501       __builtin_prefetch(d + 2 * dst_stride);
2502       __builtin_prefetch(d + 3 * dst_stride);
2503 
2504       s += 7 * src_stride;
2505 
2506       do {
2507 #if AOM_ARCH_AARCH64
2508         t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
2509         t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
2510         t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
2511         t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
2512 
2513         int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2514         int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
2515         int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
2516         int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
2517 
2518         uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2519                                       vget_low_s16(round_offset_vec));
2520         uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
2521                                       vget_low_s16(round_offset_vec));
2522         uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
2523                                       vget_low_s16(round_offset_vec));
2524         uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
2525                                       vget_low_s16(round_offset_vec));
2526 
2527         store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
2528 
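        // Slide the source window down four rows for the next iteration.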
2529         s0 = s4;
2530         s1 = s5;
2531         s2 = s6;
2532         s3 = s7;
2533         s4 = s8;
2534         s5 = s9;
2535         s6 = s10;
2536         s += 4 * src_stride;
2537         d += 4 * dst_stride;
2538         height -= 4;
2539 #else   // !AOM_ARCH_AARCH64
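        // With fewer NEON registers available on Armv7, process one row per
        // iteration.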
2540         t0 = load_unaligned_u8_4x1(s);
2541         int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2542 
2543         uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2544                                       vget_low_s16(round_offset_vec));
2545 
2546         vst1_u16(d, d0);
2547 
2548         s0 = s1;
2549         s1 = s2;
2550         s2 = s3;
2551         s3 = s4;
2552         s4 = s5;
2553         s5 = s6;
2554         s6 = s7;
2555         s += src_stride;
2556         d += dst_stride;
2557         height--;
2558 #endif  // AOM_ARCH_AARCH64
2559       } while (height != 0);
2560       src_ptr += 4;
2561       dst_ptr += 4;
2562       width -= 4;
2563     } while (width != 0);
2564   } else {
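    // Remaining blocks (w >= 8 and h >= 8) are filtered in 8-wide columns.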
2565     do {
2566       const uint8_t *s = src_ptr;
2567       CONV_BUF_TYPE *d = dst_ptr;
2568       int height = h;
2569 
2570       __builtin_prefetch(s + 0 * src_stride);
2571       __builtin_prefetch(s + 1 * src_stride);
2572       __builtin_prefetch(s + 2 * src_stride);
2573       __builtin_prefetch(s + 3 * src_stride);
2574       __builtin_prefetch(s + 4 * src_stride);
2575       __builtin_prefetch(s + 5 * src_stride);
2576       __builtin_prefetch(s + 6 * src_stride);
2577       __builtin_prefetch(s + 7 * src_stride);
2578 
2579       uint8x8_t t0, t1, t2, t3, t4, t5, t6;
2580       load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
2581 
2582       int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
2583       int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
2584       int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
2585       int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
2586       int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
2587       int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
2588       int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
2589 
2590       s += 7 * src_stride;
2591 
2592       do {
2593 #if AOM_ARCH_AARCH64
2594         uint8x8_t t7;
2595         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
2596 
2597         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
2598         int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
2599         int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
2600         int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
2601         int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
2602         int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
2603         int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
2604         int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
2605 
2606         __builtin_prefetch(dst_ptr + 0 * dst_stride);
2607         __builtin_prefetch(dst_ptr + 1 * dst_stride);
2608         __builtin_prefetch(dst_ptr + 2 * dst_stride);
2609         __builtin_prefetch(dst_ptr + 3 * dst_stride);
2610 
2611         uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2612                                       round_offset_vec);
2613         uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
2614                                       round_offset_vec);
2615         uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
2616                                       round_offset_vec);
2617         uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
2618                                       round_offset_vec);
2619         uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
2620                                       y_filter, round_offset_vec);
2621         uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
2622                                       y_filter, round_offset_vec);
2623         uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
2624                                       y_filter, round_offset_vec);
2625         uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
2626                                       y_filter, round_offset_vec);
2627 
2628         store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
2629 
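        // Slide the source window down eight rows for the next iteration.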
2630         s0 = s8;
2631         s1 = s9;
2632         s2 = s10;
2633         s3 = s11;
2634         s4 = s12;
2635         s5 = s13;
2636         s6 = s14;
2637         s += 8 * src_stride;
2638         d += 8 * dst_stride;
2639         height -= 8;
2640 #else   // !AOM_ARCH_AARCH64
2641         int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
2642 
2643         __builtin_prefetch(dst_ptr);
2644 
2645         uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2646                                       round_offset_vec);
2647 
2648         s0 = s1;
2649         s1 = s2;
2650         s2 = s3;
2651         s3 = s4;
2652         s4 = s5;
2653         s5 = s6;
2654         s6 = s7;
2655 
2656         vst1q_u16(d, d0);
2657 
2658         s += src_stride;
2659         d += dst_stride;
2660         height--;
2661 #endif  // AOM_ARCH_AARCH64
2662       } while (height != 0);
2663       src_ptr += 8;
2664       dst_ptr += 8;
2665       width -= 8;
2666     } while (width != 0);
2667   }
2668 }
2669 
2670 void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
2671                                   uint8_t *dst8, int dst8_stride, int w, int h,
2672                                   const InterpFilterParams *filter_params_y,
2673                                   const int subpel_y_qn,
2674                                   ConvolveParams *conv_params) {
2675   assert(w % 4 == 0);
2676   assert(h % 4 == 0);
2677 
2678   // Vertical filter.
2679   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
2680       filter_params_y, subpel_y_qn & SUBPEL_MASK);
2681   // Filter values are even, so downshift by 1 to reduce intermediate
2682   // precision requirements.
2683   const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
2684 
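  // Center the filter taps on the destination row: start (taps / 2 - 1)
  // rows above it.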
2685   const int vert_offset = filter_params_y->taps / 2 - 1;
2686   const uint8_t *src_ptr = src - (vert_offset * src_stride);
2687 
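  // Filters with at most 6 non-zero taps have zero outermost coefficients,
  // so the first source row can be skipped and a 6-tap convolution used.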
2688   if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) {
2689     if (conv_params->do_average) {
2690       if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
2691         dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
2692             src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter,
2693             conv_params);
2694       } else {
2695         dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride,
2696                                           dst8, dst8_stride, w, h, y_filter,
2697                                           conv_params);
2698       }
2699     } else {
2700       dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h,
2701                                     y_filter, conv_params);
2702     }
2703   } else {
2704     if (conv_params->do_average) {
2705       if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
2706         dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8,
2707                                                    dst8_stride, w, h, y_filter,
2708                                                    conv_params);
2709       } else {
2710         dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8,
2711                                           dst8_stride, w, h, y_filter,
2712                                           conv_params);
2713       }
2714     } else {
2715       dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter,
2716                                     conv_params);
2717     }
2718   }
2719 }
2720