xref: /aosp_15_r20/external/libaom/av1/common/arm/highbd_compound_convolve_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <arm_neon.h>
14 
15 #include "config/aom_config.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/arm/mem_neon.h"
20 #include "aom_ports/mem.h"
21 #include "av1/common/convolve.h"
22 #include "av1/common/filter.h"
23 #include "av1/common/arm/highbd_compound_convolve_neon.h"
24 #include "av1/common/arm/highbd_convolve_neon.h"
25 
// 6-tap filter over four 16-bit lanes, 12-bit variant.
// The accumulator is seeded with `offset`, the six inputs s0..s5 are
// multiplied by kernel lanes 1..6 of `filter`, and the 32-bit sums are
// narrowed to unsigned 16 bits with a saturating right shift by
// ROUND0_BITS + 2.
static inline uint16x4_t highbd_12_convolve6_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  // Kernel entries 0 and 7 are zero, so only lanes 1..6 are applied.
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s0, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s3, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 2);

  const uint16x4_t narrowed = vqshrun_n_s32(acc, ROUND0_BITS + 2);
  return narrowed;
}
43 
// 6-tap filter over four 16-bit lanes for the non-12-bit path.
// Identical to the 12-bit helper except that the final unsigned saturating
// narrowing shift is by ROUND0_BITS.
static inline uint16x4_t highbd_convolve6_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  // Kernel entries 0 and 7 are zero, so only lanes 1..6 are applied.
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s0, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s3, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 2);

  const uint16x4_t narrowed = vqshrun_n_s32(acc, ROUND0_BITS);
  return narrowed;
}
61 
// 6-tap filter over eight 16-bit lanes, 12-bit variant.
// The low and high halves of each input are accumulated separately in 32 bits
// (both seeded with `offset`), narrowed with an unsigned saturating right
// shift by ROUND0_BITS + 2, and recombined into one 8-lane result.
static inline uint16x8_t highbd_12_convolve6_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  // Kernel entries 0 and 7 are zero, so only lanes 1..6 are applied.
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  // Lanes 0..3.
  int32x4_t acc_lo = offset;
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_hi, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 2);

  // Lanes 4..7.
  int32x4_t acc_hi = offset;
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 2);

  const uint16x4_t res_lo = vqshrun_n_s32(acc_lo, ROUND0_BITS + 2);
  const uint16x4_t res_hi = vqshrun_n_s32(acc_hi, ROUND0_BITS + 2);
  return vcombine_u16(res_lo, res_hi);
}
87 
// 6-tap filter over eight 16-bit lanes for the non-12-bit path.
// The low and high halves of each input are accumulated separately in 32 bits
// (both seeded with `offset`) and recombined after an unsigned saturating
// narrowing shift by ROUND0_BITS.
//
// Both halves use the ROUND0_BITS macro for the shift amount (the low half
// previously used a bare literal), so the two halves cannot drift apart and
// the helper stays consistent with the other convolve helpers in this file.
static inline uint16x8_t highbd_convolve6_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  // Values at indices 0 and 7 of y_filter are zero.
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);

  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
                      vqshrun_n_s32(sum1, ROUND0_BITS));
}
112 
// Horizontal 6-tap pass for the 12-bit path, processing 8x4 tiles.
//
// For every tile, six 8-lane vectors are loaded per row via load_s16_8x6 with
// an element stride of 1, filtered with highbd_12_convolve6_8, and the four
// result rows are written with store_u16_8x4.
//
// Both loops stop only when their counter hits exactly zero after stepping by
// 8 (columns) or 4 (rows); callers are presumably expected to pass w as a
// positive multiple of 8 and h as a positive multiple of 4 — confirm against
// the call sites.
static inline void highbd_12_dist_wtd_convolve_x_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  int rows_left = h;
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      // win[r][k] is the k-th input vector for row r of the tile.
      int16x8_t win[4][6];
      for (int r = 0; r < 4; ++r) {
        load_s16_8x6(in + r * src_stride, 1, &win[r][0], &win[r][1],
                     &win[r][2], &win[r][3], &win[r][4], &win[r][5]);
      }

      uint16x8_t res[4];
      for (int r = 0; r < 4; ++r) {
        res[r] = highbd_12_convolve6_8(win[r][0], win[r][1], win[r][2],
                                       win[r][3], win[r][4], win[r][5],
                                       x_filter, offset_vec);
      }

      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left != 0);
}
158 
// Horizontal 6-tap pass for the non-12-bit path, processing 8x4 tiles.
//
// For every tile, six 8-lane vectors are loaded per row via load_s16_8x6 with
// an element stride of 1, filtered with highbd_convolve6_8, and the four
// result rows are written with store_u16_8x4.
//
// Both loops stop only when their counter hits exactly zero after stepping by
// 8 (columns) or 4 (rows); callers are presumably expected to pass w as a
// positive multiple of 8 and h as a positive multiple of 4 — confirm against
// the call sites.
static inline void highbd_dist_wtd_convolve_x_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  int rows_left = h;
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      // win[r][k] is the k-th input vector for row r of the tile.
      int16x8_t win[4][6];
      for (int r = 0; r < 4; ++r) {
        load_s16_8x6(in + r * src_stride, 1, &win[r][0], &win[r][1],
                     &win[r][2], &win[r][3], &win[r][4], &win[r][5]);
      }

      uint16x8_t res[4];
      for (int r = 0; r < 4; ++r) {
        res[r] = highbd_convolve6_8(win[r][0], win[r][1], win[r][2], win[r][3],
                                    win[r][4], win[r][5], x_filter, offset_vec);
      }

      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left != 0);
}
204 
// 8-tap filter over four 16-bit lanes, 12-bit variant.
// Inputs s0..s7 are multiplied by kernel lanes 0..7 of `filter` and
// accumulated on top of `offset`; the 32-bit sums are narrowed with an
// unsigned saturating right shift by ROUND0_BITS + 2.
static inline uint16x4_t highbd_12_convolve8_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s0, taps_lo, 0);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s3, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s6, taps_hi, 2);
  acc = vmlal_lane_s16(acc, s7, taps_hi, 3);

  const uint16x4_t narrowed = vqshrun_n_s32(acc, ROUND0_BITS + 2);
  return narrowed;
}
224 
// 8-tap filter over four 16-bit lanes for the non-12-bit path.
// Inputs s0..s7 are multiplied by kernel lanes 0..7 of `filter` and
// accumulated on top of `offset`; the 32-bit sums are narrowed with an
// unsigned saturating right shift by ROUND0_BITS.
static inline uint16x4_t highbd_convolve8_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s0, taps_lo, 0);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s3, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s6, taps_hi, 2);
  acc = vmlal_lane_s16(acc, s7, taps_hi, 3);

  const uint16x4_t narrowed = vqshrun_n_s32(acc, ROUND0_BITS);
  return narrowed;
}
244 
// 8-tap filter over eight 16-bit lanes, 12-bit variant.
// The low and high halves of each input are accumulated separately in 32 bits
// (both seeded with `offset`), narrowed with an unsigned saturating right
// shift by ROUND0_BITS + 2, and recombined into one 8-lane result.
static inline uint16x8_t highbd_12_convolve8_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  // Lanes 0..3.
  int32x4_t acc_lo = offset;
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_lo, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_hi, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_hi, 3);

  // Lanes 4..7.
  int32x4_t acc_hi = offset;
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_hi, 3);

  const uint16x4_t res_lo = vqshrun_n_s32(acc_lo, ROUND0_BITS + 2);
  const uint16x4_t res_hi = vqshrun_n_s32(acc_hi, ROUND0_BITS + 2);
  return vcombine_u16(res_lo, res_hi);
}
274 
// 8-tap filter over eight 16-bit lanes for the non-12-bit path.
// The low and high halves of each input are accumulated separately in 32 bits
// (both seeded with `offset`), narrowed with an unsigned saturating right
// shift by ROUND0_BITS, and recombined into one 8-lane result.
static inline uint16x8_t highbd_convolve8_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  // Lanes 0..3.
  int32x4_t acc_lo = offset;
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_lo, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_hi, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_hi, 3);

  // Lanes 4..7.
  int32x4_t acc_hi = offset;
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_hi, 3);

  const uint16x4_t res_lo = vqshrun_n_s32(acc_lo, ROUND0_BITS);
  const uint16x4_t res_hi = vqshrun_n_s32(acc_hi, ROUND0_BITS);
  return vcombine_u16(res_lo, res_hi);
}
304 
// 4-tap horizontal filter over four 16-bit lanes, 12-bit variant.
// s[0..3] are multiplied by lanes 0..3 of `x_filter` and accumulated on top
// of `offset`; the 32-bit sums are narrowed with an unsigned saturating right
// shift.
//
// The shift amount is written as ROUND0_BITS + 2, matching every other
// 12-bit helper in this file, instead of a bare literal that would silently
// diverge if ROUND0_BITS were ever changed.
static inline uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
                                                 const int16x4_t x_filter,
                                                 const int32x4_t offset) {
  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);

  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
}
315 
// 4-tap horizontal filter over four 16-bit lanes for the non-12-bit path.
// s[0..3] are multiplied by lanes 0..3 of `x_filter` and accumulated on top
// of `offset`; the 32-bit sums are narrowed with an unsigned saturating right
// shift by ROUND0_BITS.
static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
                                              const int16x4_t x_filter,
                                              const int32x4_t offset) {
  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s[0], x_filter, 0);
  acc = vmlal_lane_s16(acc, s[1], x_filter, 1);
  acc = vmlal_lane_s16(acc, s[2], x_filter, 2);
  acc = vmlal_lane_s16(acc, s[3], x_filter, 3);

  const uint16x4_t narrowed = vqshrun_n_s32(acc, ROUND0_BITS);
  return narrowed;
}
326 
// Horizontal pass for the 12-bit path.
//
// w == 4: a 4-tap kernel is read from x_filter_ptr + 2 and applied starting
//         at src_ptr + 2, four rows per iteration.
// else:   the full 8-lane kernel is applied to 8x4 tiles.
//
// Every loop stops only when its counter hits exactly zero after stepping by
// 4 (rows) or 8 (columns); callers are presumably expected to pass matching
// positive multiples — confirm against the call sites.
static inline void highbd_12_dist_wtd_convolve_x_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *in = (const int16_t *)(src_ptr + 2);
    uint16_t *out = dst_ptr;
    int rows_left = h;

    do {
      // win[r][k] is the k-th input vector for row r.
      int16x4_t win[4][4];
      for (int r = 0; r < 4; ++r) {
        load_s16_4x4(in + r * src_stride, 1, &win[r][0], &win[r][1],
                     &win[r][2], &win[r][3]);
      }

      uint16x4_t res[4];
      for (int r = 0; r < 4; ++r) {
        res[r] = highbd_12_convolve4_4_x(win[r], x_filter, offset_vec);
      }

      store_u16_4x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 4 * src_stride;
      out += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left != 0);
    return;
  }

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  int rows_left = h;

  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      // win[r][k] is the k-th input vector for row r of the tile.
      int16x8_t win[4][8];
      for (int r = 0; r < 4; ++r) {
        load_s16_8x8(in + r * src_stride, 1, &win[r][0], &win[r][1],
                     &win[r][2], &win[r][3], &win[r][4], &win[r][5],
                     &win[r][6], &win[r][7]);
      }

      uint16x8_t res[4];
      for (int r = 0; r < 4; ++r) {
        res[r] = highbd_12_convolve8_8(win[r][0], win[r][1], win[r][2],
                                       win[r][3], win[r][4], win[r][5],
                                       win[r][6], win[r][7], x_filter,
                                       offset_vec);
      }

      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left != 0);
}
401 
// Horizontal pass for the non-12-bit path.
//
// w == 4: a 4-tap kernel is read from x_filter_ptr + 2 and applied starting
//         at src_ptr + 2, four rows per iteration.
// else:   the full 8-lane kernel is applied to 8x4 tiles.
//
// Every loop stops only when its counter hits exactly zero after stepping by
// 4 (rows) or 8 (columns); callers are presumably expected to pass matching
// positive multiples — confirm against the call sites.
static inline void highbd_dist_wtd_convolve_x_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *in = (const int16_t *)(src_ptr + 2);
    uint16_t *out = dst_ptr;
    int rows_left = h;

    do {
      // win[r][k] is the k-th input vector for row r.
      int16x4_t win[4][4];
      for (int r = 0; r < 4; ++r) {
        load_s16_4x4(in + r * src_stride, 1, &win[r][0], &win[r][1],
                     &win[r][2], &win[r][3]);
      }

      uint16x4_t res[4];
      for (int r = 0; r < 4; ++r) {
        res[r] = highbd_convolve4_4_x(win[r], x_filter, offset_vec);
      }

      store_u16_4x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 4 * src_stride;
      out += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left != 0);
    return;
  }

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  int rows_left = h;

  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      // win[r][k] is the k-th input vector for row r of the tile.
      int16x8_t win[4][8];
      for (int r = 0; r < 4; ++r) {
        load_s16_8x8(in + r * src_stride, 1, &win[r][0], &win[r][1],
                     &win[r][2], &win[r][3], &win[r][4], &win[r][5],
                     &win[r][6], &win[r][7]);
      }

      uint16x8_t res[4];
      for (int r = 0; r < 4; ++r) {
        res[r] = highbd_convolve8_8(win[r][0], win[r][1], win[r][2], win[r][3],
                                    win[r][4], win[r][5], win[r][6], win[r][7],
                                    x_filter, offset_vec);
      }

      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left != 0);
}
476 
// Entry point for the high-bitdepth compound horizontal convolution.
//
// The horizontal filter writes either straight into conv_params->dst (when
// conv_params->do_average is zero) or into a local intermediate block, which
// is then combined into `dst` by one of the *_comp_avg_neon helpers. Which
// averaging helper runs depends on conv_params->use_dist_wtd_comp_avg.
//
// bd == 12 selects the highbd_12_* kernels; any other bd uses the generic
// ones. The 6-tap kernels are used when get_filter_tap() reports at most six
// taps and w != 4; they receive src + 1 because they apply kernel lanes 1..6
// only.
void av1_highbd_dist_wtd_convolve_x_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  int dst16_stride = conv_params->dst_stride;
  const int im_stride = MAX_SB_SIZE;
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
  // Constant added to every filter sum before the narrowing shift.
  const int offset_convolve = (1 << (conv_params->round_0 - 1)) +
                              (1 << (bd + FILTER_BITS)) +
                              (1 << (bd + FILTER_BITS - 1));

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  src -= horiz_offset;

  // Pick where the horizontal filter writes: the intermediate block when an
  // averaging step follows, otherwise the compound buffer directly.
  uint16_t *filt_dst;
  int filt_stride;
  if (conv_params->do_average) {
    filt_dst = im_block;
    filt_stride = im_stride;
  } else {
    filt_dst = dst16;
    filt_stride = dst16_stride;
  }

  const int use_6tap = (x_filter_taps <= 6) && (w != 4);

  if (bd == 12) {
    // horizontal filter
    if (use_6tap) {
      highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, filt_dst,
                                              filt_stride, w, h, x_filter_ptr,
                                              offset_convolve);
    } else {
      highbd_12_dist_wtd_convolve_x_neon(src, src_stride, filt_dst, filt_stride,
                                         w, h, x_filter_ptr, offset_convolve);
    }

    if (conv_params->do_average) {
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, conv_params);
      } else {
        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                conv_params);
      }
    }
    return;
  }

  // horizontal filter
  if (use_6tap) {
    highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, filt_dst,
                                         filt_stride, w, h, x_filter_ptr,
                                         offset_convolve);
  } else {
    highbd_dist_wtd_convolve_x_neon(src, src_stride, filt_dst, filt_stride, w,
                                    h, x_filter_ptr, offset_convolve);
  }

  if (conv_params->do_average) {
    if (conv_params->use_dist_wtd_comp_avg) {
      highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    conv_params, bd);
    } else {
      highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                           conv_params, bd);
    }
  }
}
555 
// Vertical 6-tap pass for the 12-bit path.
//
// Five rows are loaded up front; each iteration then loads four more rows,
// produces four output rows, and carries the last five rows over as the
// start of the next window.
//
// w == 4 processes a single 4-lane column; otherwise the block is walked in
// 8-lane wide columns. The loops stop only when their counter hits exactly
// zero after stepping by 4 (rows) or 8 (columns); callers are presumably
// expected to pass matching positive multiples — confirm against the call
// sites.
static inline void highbd_12_dist_wtd_convolve_y_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int rows_left = h;

    int16x4_t r0, r1, r2, r3, r4;
    load_s16_4x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
    in += 5 * src_stride;

    do {
      int16x4_t r5, r6, r7, r8;
      load_s16_4x4(in, src_stride, &r5, &r6, &r7, &r8);

      const uint16x4_t o0 =
          highbd_12_convolve6_4(r0, r1, r2, r3, r4, r5, y_filter, offset_vec);
      const uint16x4_t o1 =
          highbd_12_convolve6_4(r1, r2, r3, r4, r5, r6, y_filter, offset_vec);
      const uint16x4_t o2 =
          highbd_12_convolve6_4(r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
      const uint16x4_t o3 =
          highbd_12_convolve6_4(r3, r4, r5, r6, r7, r8, y_filter, offset_vec);

      store_u16_4x4(out, dst_stride, o0, o1, o2, o3);

      // Slide the window down by four rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;

      in += 4 * src_stride;
      out += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left != 0);
    return;
  }

  int cols_left = w;
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int rows_left = h;

    int16x8_t r0, r1, r2, r3, r4;
    load_s16_8x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
    in += 5 * src_stride;

    do {
      int16x8_t r5, r6, r7, r8;
      load_s16_8x4(in, src_stride, &r5, &r6, &r7, &r8);

      const uint16x8_t o0 =
          highbd_12_convolve6_8(r0, r1, r2, r3, r4, r5, y_filter, offset_vec);
      const uint16x8_t o1 =
          highbd_12_convolve6_8(r1, r2, r3, r4, r5, r6, y_filter, offset_vec);
      const uint16x8_t o2 =
          highbd_12_convolve6_8(r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
      const uint16x8_t o3 =
          highbd_12_convolve6_8(r3, r4, r5, r6, r7, r8, y_filter, offset_vec);

      store_u16_8x4(out, dst_stride, o0, o1, o2, o3);

      // Slide the window down by four rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;

      in += 4 * src_stride;
      out += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left != 0);

    src_ptr += 8;
    dst_ptr += 8;
    cols_left -= 8;
  } while (cols_left != 0);
}
634 
// Vertical 6-tap compound convolution; the dispatcher in this file selects it
// for bit depths other than 12.
//
// Reads h + 5 rows starting at src_ptr and writes a w x h block to dst_ptr.
// Every output row is produced by highbd_convolve6_4/_8 from six consecutive
// source rows; `offset` is broadcast to a vector and handed to those helpers.
// Four output rows are produced per pass, and the loops only terminate when h
// is a positive multiple of 4 and w is either 4 or a positive multiple of 8.
static inline void highbd_dist_wtd_convolve_y_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;

    // Prime the sliding window with the first five rows.
    int16x4_t r0, r1, r2, r3, r4;
    load_s16_4x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
    in += 5 * src_stride;

    do {
      int16x4_t r5, r6, r7, r8;
      load_s16_4x4(in, src_stride, &r5, &r6, &r7, &r8);
      in += 4 * src_stride;

      const uint16x4_t o0 =
          highbd_convolve6_4(r0, r1, r2, r3, r4, r5, y_filter, offset_vec);
      const uint16x4_t o1 =
          highbd_convolve6_4(r1, r2, r3, r4, r5, r6, y_filter, offset_vec);
      const uint16x4_t o2 =
          highbd_convolve6_4(r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
      const uint16x4_t o3 =
          highbd_convolve6_4(r3, r4, r5, r6, r7, r8, y_filter, offset_vec);

      store_u16_4x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      // Slide the window down by four rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;
      h -= 4;
    } while (h != 0);
    return;
  }

  // Wider blocks are handled as a sequence of 8-pixel-wide column strips.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int rows_left = h;

    int16x8_t r0, r1, r2, r3, r4;
    load_s16_8x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
    in += 5 * src_stride;

    do {
      int16x8_t r5, r6, r7, r8;
      load_s16_8x4(in, src_stride, &r5, &r6, &r7, &r8);
      in += 4 * src_stride;

      const uint16x8_t o0 =
          highbd_convolve6_8(r0, r1, r2, r3, r4, r5, y_filter, offset_vec);
      const uint16x8_t o1 =
          highbd_convolve6_8(r1, r2, r3, r4, r5, r6, y_filter, offset_vec);
      const uint16x8_t o2 =
          highbd_convolve6_8(r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
      const uint16x8_t o3 =
          highbd_convolve6_8(r3, r4, r5, r6, r7, r8, y_filter, offset_vec);

      store_u16_8x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;
      rows_left -= 4;
    } while (rows_left != 0);

    src_ptr += 8;
    dst_ptr += 8;
    w -= 8;
  } while (w != 0);
}
713 
// 4-tap filter step on four 4-lane inputs, 12-bit variant.
//
// Starting from `offset`, accumulates s0..s3 multiplied by filter lanes 0..3
// in 32 bits, then narrows to unsigned 16 bits with a saturating, non-rounding
// right shift by ROUND0_BITS + 2.
static inline uint16x4_t highbd_12_convolve4_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) {
  int32x4_t acc = offset;

  acc = vmlal_lane_s16(acc, s0, filter, 0);
  acc = vmlal_lane_s16(acc, s1, filter, 1);
  acc = vmlal_lane_s16(acc, s2, filter, 2);
  acc = vmlal_lane_s16(acc, s3, filter, 3);

  return vqshrun_n_s32(acc, ROUND0_BITS + 2);
}
724 
// 4-tap filter step on four 8-lane inputs, 12-bit variant.
//
// The low and high halves of s0..s3 are accumulated separately in 32 bits,
// each starting from `offset` and using filter lanes 0..3. Both halves are
// narrowed with a saturating, non-rounding right shift by ROUND0_BITS + 2 and
// recombined into one 8-lane result.
static inline uint16x8_t highbd_12_convolve4_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) {
  int32x4_t acc_lo = offset;
  int32x4_t acc_hi = offset;

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), filter, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), filter, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), filter, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), filter, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), filter, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), filter, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), filter, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), filter, 3);

  const uint16x4_t res_lo = vqshrun_n_s32(acc_lo, ROUND0_BITS + 2);
  const uint16x4_t res_hi = vqshrun_n_s32(acc_hi, ROUND0_BITS + 2);
  return vcombine_u16(res_lo, res_hi);
}
741 
// Vertical 4-tap compound convolution, 12-bit variant.
//
// Only the four coefficients at indices 2..5 of the kernel pointed to by
// y_filter_ptr are loaded. Reads h + 3 rows starting at src_ptr and writes a
// w x h block to dst_ptr, four output rows per pass. The loops only terminate
// when h is a positive multiple of 4 and w is either 4 or a positive multiple
// of 8.
static inline void highbd_12_dist_wtd_convolve_y_4tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);

  if (w == 4) {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;

    // Prime the sliding window with the first three rows.
    int16x4_t r0, r1, r2;
    load_s16_4x3(in, src_stride, &r0, &r1, &r2);
    in += 3 * src_stride;

    do {
      int16x4_t r3, r4, r5, r6;
      load_s16_4x4(in, src_stride, &r3, &r4, &r5, &r6);
      in += 4 * src_stride;

      const uint16x4_t o0 =
          highbd_12_convolve4_4(r0, r1, r2, r3, y_filter, offset_vec);
      const uint16x4_t o1 =
          highbd_12_convolve4_4(r1, r2, r3, r4, y_filter, offset_vec);
      const uint16x4_t o2 =
          highbd_12_convolve4_4(r2, r3, r4, r5, y_filter, offset_vec);
      const uint16x4_t o3 =
          highbd_12_convolve4_4(r3, r4, r5, r6, y_filter, offset_vec);

      store_u16_4x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      // The last three rows loaded become the head of the next window.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      h -= 4;
    } while (h != 0);
    return;
  }

  // Wider blocks are handled as a sequence of 8-pixel-wide column strips.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int rows_left = h;

    int16x8_t r0, r1, r2;
    load_s16_8x3(in, src_stride, &r0, &r1, &r2);
    in += 3 * src_stride;

    do {
      int16x8_t r3, r4, r5, r6;
      load_s16_8x4(in, src_stride, &r3, &r4, &r5, &r6);
      in += 4 * src_stride;

      const uint16x8_t o0 =
          highbd_12_convolve4_8(r0, r1, r2, r3, y_filter, offset_vec);
      const uint16x8_t o1 =
          highbd_12_convolve4_8(r1, r2, r3, r4, y_filter, offset_vec);
      const uint16x8_t o2 =
          highbd_12_convolve4_8(r2, r3, r4, r5, y_filter, offset_vec);
      const uint16x8_t o3 =
          highbd_12_convolve4_8(r3, r4, r5, r6, y_filter, offset_vec);

      store_u16_8x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      r0 = r4;
      r1 = r5;
      r2 = r6;
      rows_left -= 4;
    } while (rows_left != 0);

    src_ptr += 8;
    dst_ptr += 8;
    w -= 8;
  } while (w != 0);
}
818 
// 4-tap filter step on four 4-lane inputs.
//
// Starting from `offset`, accumulates s0..s3 multiplied by filter lanes 0..3
// in 32 bits, then narrows to unsigned 16 bits with a saturating, non-rounding
// right shift by ROUND0_BITS.
static inline uint16x4_t highbd_convolve4_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) {
  int32x4_t acc = offset;

  acc = vmlal_lane_s16(acc, s0, filter, 0);
  acc = vmlal_lane_s16(acc, s1, filter, 1);
  acc = vmlal_lane_s16(acc, s2, filter, 2);
  acc = vmlal_lane_s16(acc, s3, filter, 3);

  return vqshrun_n_s32(acc, ROUND0_BITS);
}
829 
// 4-tap filter step on four 8-lane inputs.
//
// The low and high halves of s0..s3 are accumulated separately in 32 bits,
// each starting from `offset` and using filter lanes 0..3. Both halves are
// narrowed with a saturating, non-rounding right shift by ROUND0_BITS and
// recombined into one 8-lane result.
static inline uint16x8_t highbd_convolve4_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) {
  int32x4_t acc_lo = offset;
  int32x4_t acc_hi = offset;

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), filter, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), filter, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), filter, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), filter, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), filter, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), filter, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), filter, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), filter, 3);

  const uint16x4_t res_lo = vqshrun_n_s32(acc_lo, ROUND0_BITS);
  const uint16x4_t res_hi = vqshrun_n_s32(acc_hi, ROUND0_BITS);
  return vcombine_u16(res_lo, res_hi);
}
846 
// Vertical 4-tap compound convolution; the dispatcher in this file selects it
// for bit depths other than 12.
//
// Only the four coefficients at indices 2..5 of the kernel pointed to by
// y_filter_ptr are loaded. Reads h + 3 rows starting at src_ptr and writes a
// w x h block to dst_ptr, four output rows per pass. The loops only terminate
// when h is a positive multiple of 4 and w is either 4 or a positive multiple
// of 8.
static inline void highbd_dist_wtd_convolve_y_4tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);

  if (w == 4) {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;

    // Prime the sliding window with the first three rows.
    int16x4_t r0, r1, r2;
    load_s16_4x3(in, src_stride, &r0, &r1, &r2);
    in += 3 * src_stride;

    do {
      int16x4_t r3, r4, r5, r6;
      load_s16_4x4(in, src_stride, &r3, &r4, &r5, &r6);
      in += 4 * src_stride;

      const uint16x4_t o0 =
          highbd_convolve4_4(r0, r1, r2, r3, y_filter, offset_vec);
      const uint16x4_t o1 =
          highbd_convolve4_4(r1, r2, r3, r4, y_filter, offset_vec);
      const uint16x4_t o2 =
          highbd_convolve4_4(r2, r3, r4, r5, y_filter, offset_vec);
      const uint16x4_t o3 =
          highbd_convolve4_4(r3, r4, r5, r6, y_filter, offset_vec);

      store_u16_4x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      // The last three rows loaded become the head of the next window.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      h -= 4;
    } while (h != 0);
    return;
  }

  // Wider blocks are handled as a sequence of 8-pixel-wide column strips.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int rows_left = h;

    int16x8_t r0, r1, r2;
    load_s16_8x3(in, src_stride, &r0, &r1, &r2);
    in += 3 * src_stride;

    do {
      int16x8_t r3, r4, r5, r6;
      load_s16_8x4(in, src_stride, &r3, &r4, &r5, &r6);
      in += 4 * src_stride;

      const uint16x8_t o0 =
          highbd_convolve4_8(r0, r1, r2, r3, y_filter, offset_vec);
      const uint16x8_t o1 =
          highbd_convolve4_8(r1, r2, r3, r4, y_filter, offset_vec);
      const uint16x8_t o2 =
          highbd_convolve4_8(r2, r3, r4, r5, y_filter, offset_vec);
      const uint16x8_t o3 =
          highbd_convolve4_8(r3, r4, r5, r6, y_filter, offset_vec);

      store_u16_8x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      r0 = r4;
      r1 = r5;
      r2 = r6;
      rows_left -= 4;
    } while (rows_left != 0);

    src_ptr += 8;
    dst_ptr += 8;
    w -= 8;
  } while (w != 0);
}
919 
// Vertical 8-tap compound convolution, 12-bit variant.
//
// Loads all eight kernel coefficients from y_filter_ptr. Reads h + 7 rows
// starting at src_ptr and writes a w x h block to dst_ptr; every output row is
// produced by highbd_12_convolve8_4/_8 from eight consecutive source rows.
// Four output rows are produced per pass, and the loops only terminate when h
// is a positive multiple of 4 and w is either 4 or a positive multiple of 8.
static inline void highbd_12_dist_wtd_convolve_y_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;

    // Prime the sliding window with the first seven rows.
    int16x4_t r0, r1, r2, r3, r4, r5, r6;
    load_s16_4x7(in, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
    in += 7 * src_stride;

    do {
      int16x4_t r7, r8, r9, r10;
      load_s16_4x4(in, src_stride, &r7, &r8, &r9, &r10);
      in += 4 * src_stride;

      const uint16x4_t o0 = highbd_12_convolve8_4(
          r0, r1, r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
      const uint16x4_t o1 = highbd_12_convolve8_4(
          r1, r2, r3, r4, r5, r6, r7, r8, y_filter, offset_vec);
      const uint16x4_t o2 = highbd_12_convolve8_4(
          r2, r3, r4, r5, r6, r7, r8, r9, y_filter, offset_vec);
      const uint16x4_t o3 = highbd_12_convolve8_4(
          r3, r4, r5, r6, r7, r8, r9, r10, y_filter, offset_vec);

      store_u16_4x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      // Slide the window down by four rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;
      r5 = r9;
      r6 = r10;
      h -= 4;
    } while (h != 0);
    return;
  }

  // Wider blocks are handled as a sequence of 8-pixel-wide column strips.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int rows_left = h;

    int16x8_t r0, r1, r2, r3, r4, r5, r6;
    load_s16_8x7(in, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
    in += 7 * src_stride;

    do {
      int16x8_t r7, r8, r9, r10;
      load_s16_8x4(in, src_stride, &r7, &r8, &r9, &r10);
      in += 4 * src_stride;

      const uint16x8_t o0 = highbd_12_convolve8_8(
          r0, r1, r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
      const uint16x8_t o1 = highbd_12_convolve8_8(
          r1, r2, r3, r4, r5, r6, r7, r8, y_filter, offset_vec);
      const uint16x8_t o2 = highbd_12_convolve8_8(
          r2, r3, r4, r5, r6, r7, r8, r9, y_filter, offset_vec);
      const uint16x8_t o3 = highbd_12_convolve8_8(
          r3, r4, r5, r6, r7, r8, r9, r10, y_filter, offset_vec);

      store_u16_8x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;
      r5 = r9;
      r6 = r10;
      rows_left -= 4;
    } while (rows_left != 0);

    src_ptr += 8;
    dst_ptr += 8;
    w -= 8;
  } while (w != 0);
}
// Vertical 8-tap compound convolution; the dispatcher in this file selects it
// for bit depths other than 12.
//
// Loads all eight kernel coefficients from y_filter_ptr. Reads h + 7 rows
// starting at src_ptr and writes a w x h block to dst_ptr; every output row is
// produced by highbd_convolve8_4/_8 from eight consecutive source rows. Four
// output rows are produced per pass, and the loops only terminate when h is a
// positive multiple of 4 and w is either 4 or a positive multiple of 8.
static inline void highbd_dist_wtd_convolve_y_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;

    // Prime the sliding window with the first seven rows.
    int16x4_t r0, r1, r2, r3, r4, r5, r6;
    load_s16_4x7(in, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
    in += 7 * src_stride;

    do {
      int16x4_t r7, r8, r9, r10;
      load_s16_4x4(in, src_stride, &r7, &r8, &r9, &r10);
      in += 4 * src_stride;

      const uint16x4_t o0 = highbd_convolve8_4(r0, r1, r2, r3, r4, r5, r6, r7,
                                               y_filter, offset_vec);
      const uint16x4_t o1 = highbd_convolve8_4(r1, r2, r3, r4, r5, r6, r7, r8,
                                               y_filter, offset_vec);
      const uint16x4_t o2 = highbd_convolve8_4(r2, r3, r4, r5, r6, r7, r8, r9,
                                               y_filter, offset_vec);
      const uint16x4_t o3 = highbd_convolve8_4(r3, r4, r5, r6, r7, r8, r9, r10,
                                               y_filter, offset_vec);

      store_u16_4x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      // Slide the window down by four rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;
      r5 = r9;
      r6 = r10;
      h -= 4;
    } while (h != 0);
    return;
  }

  // Wider blocks are handled as a sequence of 8-pixel-wide column strips.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int rows_left = h;

    int16x8_t r0, r1, r2, r3, r4, r5, r6;
    load_s16_8x7(in, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
    in += 7 * src_stride;

    do {
      int16x8_t r7, r8, r9, r10;
      load_s16_8x4(in, src_stride, &r7, &r8, &r9, &r10);
      in += 4 * src_stride;

      const uint16x8_t o0 = highbd_convolve8_8(r0, r1, r2, r3, r4, r5, r6, r7,
                                               y_filter, offset_vec);
      const uint16x8_t o1 = highbd_convolve8_8(r1, r2, r3, r4, r5, r6, r7, r8,
                                               y_filter, offset_vec);
      const uint16x8_t o2 = highbd_convolve8_8(r2, r3, r4, r5, r6, r7, r8, r9,
                                               y_filter, offset_vec);
      const uint16x8_t o3 = highbd_convolve8_8(r3, r4, r5, r6, r7, r8, r9, r10,
                                               y_filter, offset_vec);

      store_u16_8x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;
      r5 = r9;
      r6 = r10;
      rows_left -= 4;
    } while (rows_left != 0);

    src_ptr += 8;
    dst_ptr += 8;
    w -= 8;
  } while (w != 0);
}
1084 
// Neon implementation of the vertical-only high bit-depth compound
// convolution.
//
// A 4-, 6- or 8-tap vertical kernel is chosen from get_filter_tap(), and the
// 12-bit or generic variant from bd. When conv_params->do_average is zero the
// filtered block is written directly to conv_params->dst. Otherwise it is
// written to a stack buffer, which is then passed with dst and conv_params to
// the averaging helper (the dist_wtd one when
// conv_params->use_dist_wtd_comp_avg is set).
void av1_highbd_dist_wtd_convolve_y_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  const int im_stride = MAX_SB_SIZE;
  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int vert_offset = filter_params_y->taps / 2 - 1;
  const int round_offset_conv = (1 << (conv_params->round_0 - 1)) +
                                (1 << (bd + FILTER_BITS)) +
                                (1 << (bd + FILTER_BITS - 1));

  // The convolution output goes to the intermediate buffer when it still has
  // to be averaged, and straight to the compound buffer otherwise.
  const int do_average = conv_params->do_average;
  uint16_t *const out = do_average ? im_block : conv_params->dst;
  const int out_stride = do_average ? im_stride : conv_params->dst_stride;

  // Step back to the first row covered by the full-length kernel; the shorter
  // kernels skip forward again below.
  src -= vert_offset * src_stride;

  if (bd == 12) {
    if (y_filter_taps <= 4) {
      highbd_12_dist_wtd_convolve_y_4tap_neon(
          src + 2 * src_stride, src_stride, out, out_stride, w, h,
          y_filter_ptr, round_offset_conv);
    } else if (y_filter_taps == 6) {
      highbd_12_dist_wtd_convolve_y_6tap_neon(
          src + src_stride, src_stride, out, out_stride, w, h, y_filter_ptr,
          round_offset_conv);
    } else {
      highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, out, out_stride,
                                              w, h, y_filter_ptr,
                                              round_offset_conv);
    }

    if (!do_average) {
      return;
    }
    if (conv_params->use_dist_wtd_comp_avg) {
      highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                       h, conv_params);
    } else {
      highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                              conv_params);
    }
    return;
  }

  if (y_filter_taps <= 4) {
    highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride, out,
                                         out_stride, w, h, y_filter_ptr,
                                         round_offset_conv);
  } else if (y_filter_taps == 6) {
    highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, out,
                                         out_stride, w, h, y_filter_ptr,
                                         round_offset_conv);
  } else {
    highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, out, out_stride, w,
                                         h, y_filter_ptr, round_offset_conv);
  }

  if (!do_average) {
    return;
  }
  if (conv_params->use_dist_wtd_comp_avg) {
    highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                  conv_params, bd);
  } else {
    highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                         conv_params, bd);
  }
}
1182 
// Copies a w x h block while converting it to the compound intermediate
// format: every sample is shifted left by round_bits and then has `offset`
// added, all in 16-bit lanes.
//
// For w <= 4 a full 4-lane vector is loaded per row regardless of w; when
// w == 2 the row is written through store_u16_2x1, otherwise all four lanes
// are stored. For larger w each row is processed in steps of 8 pixels.
static inline void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
                                       uint16_t *dst_ptr, int dst_stride, int w,
                                       int h, const int round_bits,
                                       const int offset) {
  if (w > 4) {
    const uint16x8_t offset_u16 = vdupq_n_u16(offset);
    const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits);

    for (int y = 0; y < h; ++y) {
      const uint16_t *src_row = src_ptr + y * src_stride;
      uint16_t *dst_row = dst_ptr + y * dst_stride;

      for (int x = 0; x < w; x += 8) {
        const uint16x8_t px = vld1q_u16(src_row + x);
        const uint16x8_t scaled = vshlq_u16(px, round_shift_s16);
        vst1q_u16(dst_row + x, vaddq_u16(scaled, offset_u16));
      }
    }
    return;
  }

  const uint16x4_t offset_u16 = vdup_n_u16(offset);
  const int16x4_t round_shift_s16 = vdup_n_s16(round_bits);

  for (int y = 0; y < h; ++y) {
    const uint16_t *src_row = src_ptr + y * src_stride;
    uint16_t *dst_row = dst_ptr + y * dst_stride;

    const uint16x4_t px = vld1_u16(src_row);
    const uint16x4_t res = vadd_u16(vshl_u16(px, round_shift_s16), offset_u16);

    if (w != 2) {
      vst1_u16(dst_row, res);
    } else {
      store_u16_2x1(dst_row, res);
    }
  }
}
1215 
// Neon implementation of the high bit-depth compound "copy" case (no
// filtering in either direction).
//
// highbd_2d_copy_neon rescales the source block by round_bits and adds
// round_offset. When conv_params->do_average is zero the result is written to
// conv_params->dst and nothing else happens. Otherwise the result goes to a
// stack buffer that is then handed, together with dst and conv_params, to the
// averaging helper matching bd and conv_params->use_dist_wtd_comp_avg.
void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
                                               int src_stride, uint16_t *dst,
                                               int dst_stride, int w, int h,
                                               ConvolveParams *conv_params,
                                               int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  const int im_stride = MAX_SB_SIZE;

  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  assert(round_bits >= 0);

  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));

  if (!conv_params->do_average) {
    // First prediction of a compound pair: store it for later averaging.
    CONV_BUF_TYPE *dst16 = conv_params->dst;
    const int dst16_stride = conv_params->dst_stride;
    highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits,
                        round_offset);
    return;
  }

  highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits,
                      round_offset);

  const int use_dist_wtd = conv_params->use_dist_wtd_comp_avg;
  if (bd == 12) {
    if (use_dist_wtd) {
      highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                       h, conv_params);
    } else {
      highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                              conv_params);
    }
  } else {
    if (use_dist_wtd) {
      highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    conv_params, bd);
    } else {
      highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                           conv_params, bd);
    }
  }
}
1262 
// Vertical 6-tap filter step of the 2D path on six 4-lane inputs.
//
// Starting from `offset`, accumulates s0..s5 multiplied by y_filter lanes 1..6
// in 32 bits, then narrows to unsigned 16 bits with a saturating, rounding
// right shift by COMPOUND_ROUND1_BITS.
static inline uint16x4_t highbd_convolve6_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const int32x4_t offset) {
  // Lanes 0 and 7 of y_filter are zero for a 6-tap kernel and are skipped.
  const int16x4_t taps_lo = vget_low_s16(y_filter);
  const int16x4_t taps_hi = vget_high_s16(y_filter);

  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s0, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s3, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 2);

  return vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS);
}
1280 
// Vertical 6-tap filter step of the 2D path on six 8-lane inputs.
//
// The low and high halves of s0..s5 are accumulated separately in 32 bits,
// each starting from `offset` and using y_filter lanes 1..6. Both halves are
// narrowed with a saturating, rounding right shift by COMPOUND_ROUND1_BITS
// and recombined into one 8-lane result.
static inline uint16x8_t highbd_convolve6_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const int32x4_t offset) {
  // Lanes 0 and 7 of y_filter are zero for a 6-tap kernel and are skipped.
  const int16x4_t taps_lo = vget_low_s16(y_filter);
  const int16x4_t taps_hi = vget_high_s16(y_filter);

  int32x4_t acc_lo = offset;
  int32x4_t acc_hi = offset;

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 3);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_hi, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 2);

  const uint16x4_t res_lo = vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS);
  const uint16x4_t res_hi = vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS);
  return vcombine_u16(res_lo, res_hi);
}
1306 
// Vertical 6-tap pass of the 2D compound convolution.
//
// Reads h + 5 rows starting at src_ptr and writes a w x h block to dst_ptr.
// Every output row is produced by highbd_convolve6_4_2d_v/_8_2d_v from six
// consecutive source rows, with `offset` broadcast as the accumulator start
// value. Four output rows are produced per pass, and the loops only terminate
// when h is a positive multiple of 4 and w is either 4 or a positive multiple
// of 8.
static inline void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;

    // Prime the sliding window with the first five rows.
    int16x4_t r0, r1, r2, r3, r4;
    load_s16_4x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
    in += 5 * src_stride;

    do {
      int16x4_t r5, r6, r7, r8;
      load_s16_4x4(in, src_stride, &r5, &r6, &r7, &r8);
      in += 4 * src_stride;

      const uint16x4_t o0 =
          highbd_convolve6_4_2d_v(r0, r1, r2, r3, r4, r5, y_filter, offset_vec);
      const uint16x4_t o1 =
          highbd_convolve6_4_2d_v(r1, r2, r3, r4, r5, r6, y_filter, offset_vec);
      const uint16x4_t o2 =
          highbd_convolve6_4_2d_v(r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
      const uint16x4_t o3 =
          highbd_convolve6_4_2d_v(r3, r4, r5, r6, r7, r8, y_filter, offset_vec);

      store_u16_4x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      // Slide the window down by four rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;
      h -= 4;
    } while (h != 0);
    return;
  }

  // Wider blocks are handled as a sequence of 8-pixel-wide column strips.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int rows_left = h;

    int16x8_t r0, r1, r2, r3, r4;
    load_s16_8x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
    in += 5 * src_stride;

    do {
      int16x8_t r5, r6, r7, r8;
      load_s16_8x4(in, src_stride, &r5, &r6, &r7, &r8);
      in += 4 * src_stride;

      const uint16x8_t o0 =
          highbd_convolve6_8_2d_v(r0, r1, r2, r3, r4, r5, y_filter, offset_vec);
      const uint16x8_t o1 =
          highbd_convolve6_8_2d_v(r1, r2, r3, r4, r5, r6, y_filter, offset_vec);
      const uint16x8_t o2 =
          highbd_convolve6_8_2d_v(r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
      const uint16x8_t o3 =
          highbd_convolve6_8_2d_v(r3, r4, r5, r6, r7, r8, y_filter, offset_vec);

      store_u16_8x4(out, dst_stride, o0, o1, o2, o3);
      out += 4 * dst_stride;

      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;
      rows_left -= 4;
    } while (rows_left != 0);

    src_ptr += 8;
    dst_ptr += 8;
    w -= 8;
  } while (w != 0);
}
1385 
// 8-tap vertical kernel for four 16-bit lanes.
//
// Starting from `offset`, accumulates s0..s7 (one vector per input row)
// multiplied by coefficients 0..7 of `y_filter` in 32-bit lanes, then narrows
// the result to unsigned 16 bits with a rounding, saturating right shift by
// COMPOUND_ROUND1_BITS.
static inline uint16x4_t highbd_convolve8_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    const int32x4_t offset) {
  // The by-lane multiply-accumulate takes its coefficients from a 64-bit
  // vector, so the kernel is split into two halves.
  const int16x4_t taps_lo = vget_low_s16(y_filter);   // coefficients 0..3
  const int16x4_t taps_hi = vget_high_s16(y_filter);  // coefficients 4..7

  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s0, taps_lo, 0);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s3, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s6, taps_hi, 2);
  acc = vmlal_lane_s16(acc, s7, taps_hi, 3);

  return vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS);
}
1405 
// 8-tap vertical kernel for eight 16-bit lanes.
//
// Each input row s0..s7 is split into its low and high four lanes. The two
// halves are accumulated independently (both seeded with `offset`) against
// coefficients 0..7 of `y_filter`, narrowed to unsigned 16 bits with a
// rounding, saturating right shift by COMPOUND_ROUND1_BITS, and recombined.
static inline uint16x8_t highbd_convolve8_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(y_filter);   // coefficients 0..3
  const int16x4_t taps_hi = vget_high_s16(y_filter);  // coefficients 4..7

  int32x4_t acc_lo = offset;  // output lanes 0..3
  int32x4_t acc_hi = offset;  // output lanes 4..7

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_lo, 3);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_hi, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_hi, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_hi, 3);

  const uint16x4_t out_lo = vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS);
  const uint16x4_t out_hi = vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS);
  return vcombine_u16(out_lo, out_hi);
}
1435 
// Vertical 8-tap pass of the 2D compound convolution.
//
// Reads rows from src_ptr (reinterpreted as int16_t), filters them with the
// eight coefficients at y_filter_ptr, seeds every accumulator with `offset`,
// and writes h rows of results to dst_ptr.
//
// Four output rows are produced per iteration, and the row loops stop only
// when the remaining count reaches exactly zero, so h must be a positive
// multiple of 4. Blocks with w <= 4 are handled as a single 4-lane column;
// wider blocks are processed in 8-lane columns and require w to be a multiple
// of 8 for the same reason.
static inline void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int offset) {
  const int32x4_t round_offset = vdupq_n_s32(offset);
  const int16x8_t taps = vld1q_s16(y_filter_ptr);

  if (w <= 4) {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int rows_left = h;

    // r[0..6] carry the seven rows shared with the previous iteration;
    // r[7..10] receive the four rows loaded on each pass.
    int16x4_t r[11];
    load_s16_4x7(in, src_stride, &r[0], &r[1], &r[2], &r[3], &r[4], &r[5],
                 &r[6]);
    in += 7 * src_stride;

    do {
      load_s16_4x4(in, src_stride, &r[7], &r[8], &r[9], &r[10]);

      const uint16x4_t o0 = highbd_convolve8_4_2d_v(
          r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], taps, round_offset);
      const uint16x4_t o1 = highbd_convolve8_4_2d_v(
          r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], taps, round_offset);
      const uint16x4_t o2 = highbd_convolve8_4_2d_v(
          r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], taps, round_offset);
      const uint16x4_t o3 = highbd_convolve8_4_2d_v(
          r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10], taps, round_offset);

      store_u16_4x4(out, dst_stride, o0, o1, o2, o3);

      // Slide the window down four rows. Ascending order is safe because
      // each slot is read (at i + 4) before it is overwritten.
      for (int i = 0; i < 7; ++i) {
        r[i] = r[i + 4];
      }

      in += 4 * src_stride;
      out += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left != 0);

    return;
  }

  // Wide blocks: one 8-lane column at a time, top to bottom.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int rows_left = h;

    int16x8_t r[11];
    load_s16_8x7(in, src_stride, &r[0], &r[1], &r[2], &r[3], &r[4], &r[5],
                 &r[6]);
    in += 7 * src_stride;

    do {
      load_s16_8x4(in, src_stride, &r[7], &r[8], &r[9], &r[10]);

      const uint16x8_t o0 = highbd_convolve8_8_2d_v(
          r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], taps, round_offset);
      const uint16x8_t o1 = highbd_convolve8_8_2d_v(
          r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], taps, round_offset);
      const uint16x8_t o2 = highbd_convolve8_8_2d_v(
          r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], taps, round_offset);
      const uint16x8_t o3 = highbd_convolve8_8_2d_v(
          r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10], taps, round_offset);

      store_u16_8x4(out, dst_stride, o0, o1, o2, o3);

      // Slide the window down four rows (see the narrow path above).
      for (int i = 0; i < 7; ++i) {
        r[i] = r[i + 4];
      }

      in += 4 * src_stride;
      out += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left != 0);

    src_ptr += 8;
    dst_ptr += 8;
    w -= 8;
  } while (w != 0);
}
1518 
// Horizontal 6-tap pass of the 2D compound convolution, 12-bit variant.
//
// For every row, each group of 8 outputs is computed by
// highbd_12_convolve6_8() from six 8-lane vectors loaded at consecutive
// element offsets of the source row. All eight coefficients at x_filter_ptr
// are loaded; which of them the 6-tap kernel uses is decided by that helper.
//
// The column loops stop only when the remaining width reaches exactly zero,
// so w must be a positive multiple of 8.
static inline void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // Both row loops below are do-while: the unrolled loop always consumes four
  // rows and the tail loop at least one, so five rows is the minimum. Within
  // this file the caller passes the intermediate height (block height plus
  // the extra rows needed by the vertical filter).
  assert(h >= 5);
  const int16x8_t taps = vld1q_s16(x_filter_ptr);
  const int32x4_t round_offset = vdupq_n_s32(offset);

  int rows_left = h;

  // Unrolled part: four rows per pass while more than four rows remain.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      uint16x8_t res[4];
      for (int r = 0; r < 4; ++r) {
        int16x8_t t[6];
        load_s16_8x6(in + r * src_stride, 1, &t[0], &t[1], &t[2], &t[3], &t[4],
                     &t[5]);
        res[r] = highbd_12_convolve6_8(t[0], t[1], t[2], t[3], t[4], t[5], taps,
                                       round_offset);
      }

      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left > 4);

  // Tail: the remaining 1..4 rows, one row at a time.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      int16x8_t t[6];
      load_s16_8x6(in, 1, &t[0], &t[1], &t[2], &t[3], &t[4], &t[5]);

      vst1q_u16(out, highbd_12_convolve6_8(t[0], t[1], t[2], t[3], t[4], t[5],
                                           taps, round_offset));

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--rows_left != 0);
}
1588 
// Horizontal 6-tap pass of the 2D compound convolution (variant that the
// top-level function in this file selects when bd != 12).
//
// For every row, each group of 8 outputs is computed by highbd_convolve6_8()
// from six 8-lane vectors loaded at consecutive element offsets of the source
// row. All eight coefficients at x_filter_ptr are loaded; which of them the
// 6-tap kernel uses is decided by that helper.
//
// The column loops stop only when the remaining width reaches exactly zero,
// so w must be a positive multiple of 8.
static inline void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // Both row loops below are do-while: the unrolled loop always consumes four
  // rows and the tail loop at least one, so five rows is the minimum. Within
  // this file the caller passes the intermediate height (block height plus
  // the extra rows needed by the vertical filter).
  assert(h >= 5);
  const int16x8_t taps = vld1q_s16(x_filter_ptr);
  const int32x4_t round_offset = vdupq_n_s32(offset);

  int rows_left = h;

  // Unrolled part: four rows per pass while more than four rows remain.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      uint16x8_t res[4];
      for (int r = 0; r < 4; ++r) {
        int16x8_t t[6];
        load_s16_8x6(in + r * src_stride, 1, &t[0], &t[1], &t[2], &t[3], &t[4],
                     &t[5]);
        res[r] = highbd_convolve6_8(t[0], t[1], t[2], t[3], t[4], t[5], taps,
                                    round_offset);
      }

      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left > 4);

  // Tail: the remaining 1..4 rows, one row at a time.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      int16x8_t t[6];
      load_s16_8x6(in, 1, &t[0], &t[1], &t[2], &t[3], &t[4], &t[5]);

      vst1q_u16(out, highbd_convolve6_8(t[0], t[1], t[2], t[3], t[4], t[5],
                                        taps, round_offset));

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--rows_left != 0);
}
1658 
// Horizontal pass of the 2D compound convolution, 12-bit variant, covering
// the cases not handled by the dedicated 6-tap routine.
//
//  - w == 4: a 4-tap kernel made of coefficients 2..5 at x_filter_ptr, applied
//    by highbd_12_convolve4_4_x() to source data starting one element after
//    src_ptr.
//  - otherwise: the full 8-coefficient kernel, applied by
//    highbd_12_convolve8_8() to 8 output lanes at a time; the column loops
//    stop only at exactly zero, so w must be a positive multiple of 8.
static inline void highbd_12_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // Every row loop pair below is do-while: the unrolled loop always consumes
  // four rows and the tail loop at least one, so five rows is the minimum.
  // Within this file the caller passes the intermediate height (block height
  // plus the extra rows needed by the vertical filter).
  assert(h >= 5);
  const int32x4_t round_offset = vdupq_n_s32(offset);

  if (w == 4) {
    // Blocks of width 4 use a 4-tap filter: skip the two leading coefficients
    // and, correspondingly, the first source element.
    const int16x4_t taps = vld1_s16(x_filter_ptr + 2);
    const int16_t *in = (const int16_t *)(src_ptr + 1);
    uint16_t *out = dst_ptr;
    int rows_left = h;

    // Unrolled part: four rows per pass while more than four rows remain.
    do {
      uint16x4_t res[4];
      for (int r = 0; r < 4; ++r) {
        int16x4_t t[4];
        load_s16_4x4(in + r * src_stride, 1, &t[0], &t[1], &t[2], &t[3]);
        res[r] = highbd_12_convolve4_4_x(t, taps, round_offset);
      }

      store_u16_4x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 4 * src_stride;
      out += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left > 4);

    // Tail: the remaining 1..4 rows, one row at a time.
    do {
      int16x4_t t[4];
      load_s16_4x4(in, 1, &t[0], &t[1], &t[2], &t[3]);

      vst1_u16(out, highbd_12_convolve4_4_x(t, taps, round_offset));

      in += src_stride;
      out += dst_stride;
    } while (--rows_left != 0);

    return;
  }

  const int16x8_t taps = vld1q_s16(x_filter_ptr);
  int rows_left = h;

  // Unrolled part: four rows per pass while more than four rows remain.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      uint16x8_t res[4];
      for (int r = 0; r < 4; ++r) {
        int16x8_t t[8];
        load_s16_8x8(in + r * src_stride, 1, &t[0], &t[1], &t[2], &t[3], &t[4],
                     &t[5], &t[6], &t[7]);
        res[r] = highbd_12_convolve8_8(t[0], t[1], t[2], t[3], t[4], t[5], t[6],
                                       t[7], taps, round_offset);
      }

      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left > 4);

  // Tail: the remaining 1..4 rows, one row at a time.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      int16x8_t t[8];
      load_s16_8x8(in + 0 * src_stride, 1, &t[0], &t[1], &t[2], &t[3], &t[4],
                   &t[5], &t[6], &t[7]);

      vst1q_u16(out, highbd_12_convolve8_8(t[0], t[1], t[2], t[3], t[4], t[5],
                                           t[6], t[7], taps, round_offset));

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--rows_left != 0);
}
1770 
// Horizontal pass of the 2D compound convolution (variant that the top-level
// function in this file selects when bd != 12), covering the cases not handled
// by the dedicated 6-tap routine.
//
//  - w == 4: a 4-tap kernel made of coefficients 2..5 at x_filter_ptr, applied
//    by highbd_convolve4_4_x() to source data starting one element after
//    src_ptr.
//  - otherwise: the full 8-coefficient kernel, applied by highbd_convolve8_8()
//    to 8 output lanes at a time; the column loops stop only at exactly zero,
//    so w must be a positive multiple of 8.
static inline void highbd_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // Every row loop pair below is do-while: the unrolled loop always consumes
  // four rows and the tail loop at least one, so five rows is the minimum.
  // Within this file the caller passes the intermediate height (block height
  // plus the extra rows needed by the vertical filter).
  assert(h >= 5);
  const int32x4_t round_offset = vdupq_n_s32(offset);

  if (w == 4) {
    // Blocks of width 4 use a 4-tap filter: skip the two leading coefficients
    // and, correspondingly, the first source element.
    const int16x4_t taps = vld1_s16(x_filter_ptr + 2);
    const int16_t *in = (const int16_t *)(src_ptr + 1);
    uint16_t *out = dst_ptr;
    int rows_left = h;

    // Unrolled part: four rows per pass while more than four rows remain.
    do {
      uint16x4_t res[4];
      for (int r = 0; r < 4; ++r) {
        int16x4_t t[4];
        load_s16_4x4(in + r * src_stride, 1, &t[0], &t[1], &t[2], &t[3]);
        res[r] = highbd_convolve4_4_x(t, taps, round_offset);
      }

      store_u16_4x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 4 * src_stride;
      out += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left > 4);

    // Tail: the remaining 1..4 rows, one row at a time.
    do {
      int16x4_t t[4];
      load_s16_4x4(in, 1, &t[0], &t[1], &t[2], &t[3]);

      vst1_u16(out, highbd_convolve4_4_x(t, taps, round_offset));

      in += src_stride;
      out += dst_stride;
    } while (--rows_left != 0);

    return;
  }

  const int16x8_t taps = vld1q_s16(x_filter_ptr);
  int rows_left = h;

  // Unrolled part: four rows per pass while more than four rows remain.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      uint16x8_t res[4];
      for (int r = 0; r < 4; ++r) {
        int16x8_t t[8];
        load_s16_8x8(in + r * src_stride, 1, &t[0], &t[1], &t[2], &t[3], &t[4],
                     &t[5], &t[6], &t[7]);
        res[r] = highbd_convolve8_8(t[0], t[1], t[2], t[3], t[4], t[5], t[6],
                                    t[7], taps, round_offset);
      }

      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left > 4);

  // Tail: the remaining 1..4 rows, one row at a time.
  do {
    const int16_t *in = (const int16_t *)src_ptr;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      int16x8_t t[8];
      load_s16_8x8(in + 0 * src_stride, 1, &t[0], &t[1], &t[2], &t[3], &t[4],
                   &t[5], &t[6], &t[7]);

      vst1q_u16(out, highbd_convolve8_8(t[0], t[1], t[2], t[3], t[4], t[5],
                                        t[6], t[7], taps, round_offset));

      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--rows_left != 0);
}
1882 
// High bit-depth 2D (horizontal then vertical) compound convolution.
//
// 1. The horizontal pass filters h + (clamped vertical taps - 1) rows of
//    `src` into the intermediate buffer im_block.
// 2. The vertical pass filters im_block. Without averaging its output goes
//    straight to conv_params->dst; with averaging it goes to the scratch
//    buffer im_block2.
// 3. When conv_params->do_average is set, one of the *_comp_avg_neon helpers
//    is called with im_block2, `dst` and conv_params; the distance-weighted
//    helper is used when conv_params->use_dist_wtd_comp_avg is set.
//
// Tap counts below 6 are clamped to 6 when deriving the source offsets and
// the intermediate height. bd == 12 selects the dedicated 12-bit helpers.
void av1_highbd_dist_wtd_convolve_2d_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  DECLARE_ALIGNED(16, uint16_t,
                  im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  const int do_average = conv_params->do_average;

  // Filters shorter than 6 taps are treated as 6-tap for addressing purposes.
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int clamped_x_taps = x_filter_taps >= 6 ? x_filter_taps : 6;
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps >= 6 ? y_filter_taps : 6;

  const int im_stride = MAX_SB_SIZE;
  const int im_h = h + clamped_y_taps - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  const int vert_offset = clamped_y_taps / 2 - 1;

  // Horizontal offset: 1 << (bd + FILTER_BITS - 1) plus half of the round_0
  // step. The half step presumably lets the horizontal kernels (defined
  // elsewhere) shift without a separate rounding operation.
  const int round_offset_conv_x =
      (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset_conv_y = (1 << y_offset_bits);

  // Step back to the first source sample covered by the filter support.
  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // Horizontal pass. The dedicated 6-tap routines have no width-4 path, so
  // width-4 blocks always take the general routines.
  const int use_6tap_horiz = (x_filter_taps <= 6) && (w != 4);
  if (use_6tap_horiz) {
    if (bd == 12) {
      highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    } else {
      highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    }
  } else {
    if (bd == 12) {
      highbd_12_dist_wtd_convolve_2d_horiz_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    } else {
      highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block,
                                             im_stride, w, im_h, x_filter_ptr,
                                             round_offset_conv_x);
    }
  }

  // Vertical pass. Pick the destination once: the scratch buffer when the
  // result still has to be averaged, the compound buffer otherwise.
  uint16_t *vert_dst = dst16;
  int vert_dst_stride = dst16_stride;
  if (do_average) {
    vert_dst = im_block2;
    vert_dst_stride = im_stride;
  }

  if (y_filter_taps <= 6) {
    highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, vert_dst,
                                               vert_dst_stride, w, h,
                                               y_filter_ptr,
                                               round_offset_conv_y);
  } else {
    highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, vert_dst,
                                               vert_dst_stride, w, h,
                                               y_filter_ptr,
                                               round_offset_conv_y);
  }

  // Compound averaging is done as a separate pass so that the filter loops
  // above contain no per-pixel branching.
  if (!do_average) {
    return;
  }

  if (bd == 12) {
    if (conv_params->use_dist_wtd_comp_avg) {
      highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
                                       w, h, conv_params);
    } else {
      highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                              conv_params);
    }
  } else {
    if (conv_params->use_dist_wtd_comp_avg) {
      highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
                                    h, conv_params, bd);
    } else {
      highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                           conv_params, bd);
    }
  }
}
1986