xref: /aosp_15_r20/external/libaom/av1/common/arm/highbd_convolve_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <arm_neon.h>
14 
15 #include "config/aom_config.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/arm/mem_neon.h"
20 #include "aom_ports/mem.h"
21 #include "av1/common/convolve.h"
22 #include "av1/common/filter.h"
23 
// Apply a 6-tap vertical filter to 4 high bit-depth pixels. The 8-tap kernel
// in y_filter has zero taps at indices 0 and 7, so only lanes 1-6 are used.
static inline uint16x4_t highbd_convolve6_4_y(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const uint16x4_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter);

  // Widening multiply-accumulate into a 32-bit lane accumulator.
  int32x4_t acc = vmull_lane_s16(s0, taps_0_3, 1);
  acc = vmlal_lane_s16(acc, s1, taps_0_3, 2);
  acc = vmlal_lane_s16(acc, s2, taps_0_3, 3);
  acc = vmlal_lane_s16(acc, s3, taps_4_7, 0);
  acc = vmlal_lane_s16(acc, s4, taps_4_7, 1);
  acc = vmlal_lane_s16(acc, s5, taps_4_7, 2);

  // Round, narrow with unsigned saturation, then clamp to the bit-depth max.
  return vmin_u16(vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS), max);
}
42 
// Apply a 6-tap vertical filter to 8 high bit-depth pixels. The 8-tap kernel
// in y_filter has zero taps at indices 0 and 7, so only lanes 1-6 are used.
static inline uint16x8_t highbd_convolve6_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const uint16x8_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter);

  // Widening multiplies produce 32-bit lanes, so handle the 8-wide input as
  // two independent 4-wide accumulations, interleaved for pipelining.
  int32x4_t acc_lo = vmull_lane_s16(vget_low_s16(s0), taps_0_3, 1);
  int32x4_t acc_hi = vmull_lane_s16(vget_high_s16(s0), taps_0_3, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_0_3, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_0_3, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_0_3, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_0_3, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_4_7, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_4_7, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_4_7, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_4_7, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_4_7, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_4_7, 2);

  // Round, narrow with unsigned saturation, then clamp to the bit-depth max.
  const uint16x8_t narrowed =
      vcombine_u16(vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS),
                   vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS));
  return vminq_u16(narrowed, max);
}
69 
// Vertical 6-tap high bit-depth convolution over a w x h block. The 8-tap
// kernel loaded from y_filter_ptr has zero taps at indices 0 and 7 (see the
// 6-tap helpers), so each output row depends on 6 source rows only.
static inline void highbd_convolve_y_sr_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int bd) {
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    // Clamp limit for bd-bit pixels.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    // Skip the first source row: tap 0 of the 8-tap kernel is zero.
    const int16_t *s = (const int16_t *)(src_ptr + src_stride);
    uint16_t *d = dst_ptr;

    // Prime the 5-row sliding window; each iteration loads 4 more rows and
    // produces 4 output rows.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max);
      uint16x4_t d1 =
          highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max);
      uint16x4_t d2 =
          highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max);
      uint16x4_t d3 =
          highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the row window down by 4.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    // Width is a multiple of 8 and height is a multiple of 4.
    do {
      int height = h;
      // Skip the first source row: tap 0 of the 8-tap kernel is zero.
      const int16_t *s = (const int16_t *)(src_ptr + src_stride);
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max);
        uint16x8_t d1 =
            highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max);
        uint16x8_t d2 =
            highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max);
        uint16x8_t d3 =
            highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the row window down by 4.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      // Advance to the next 8-wide column strip.
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
151 
// Apply an 8-tap vertical filter to 4 high bit-depth pixels.
static inline uint16x4_t highbd_convolve8_4_y(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    const uint16x4_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter);

  // Widening multiply-accumulate into a 32-bit lane accumulator.
  int32x4_t acc = vmull_lane_s16(s0, taps_0_3, 0);
  acc = vmlal_lane_s16(acc, s1, taps_0_3, 1);
  acc = vmlal_lane_s16(acc, s2, taps_0_3, 2);
  acc = vmlal_lane_s16(acc, s3, taps_0_3, 3);
  acc = vmlal_lane_s16(acc, s4, taps_4_7, 0);
  acc = vmlal_lane_s16(acc, s5, taps_4_7, 1);
  acc = vmlal_lane_s16(acc, s6, taps_4_7, 2);
  acc = vmlal_lane_s16(acc, s7, taps_4_7, 3);

  // Round, narrow with unsigned saturation, then clamp to the bit-depth max.
  return vmin_u16(vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS), max);
}
172 
// Apply an 8-tap vertical filter to 8 high bit-depth pixels.
static inline uint16x8_t highbd_convolve8_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const uint16x8_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter);

  // Widening multiplies produce 32-bit lanes, so handle the 8-wide input as
  // two independent 4-wide accumulations, interleaved for pipelining.
  int32x4_t acc_lo = vmull_lane_s16(vget_low_s16(s0), taps_0_3, 0);
  int32x4_t acc_hi = vmull_lane_s16(vget_high_s16(s0), taps_0_3, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_0_3, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_0_3, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_0_3, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_0_3, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_0_3, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_0_3, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_4_7, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_4_7, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_4_7, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_4_7, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_4_7, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_4_7, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_4_7, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_4_7, 3);

  // Round, narrow with unsigned saturation, then clamp to the bit-depth max.
  const uint16x8_t narrowed =
      vcombine_u16(vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS),
                   vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS));
  return vminq_u16(narrowed, max);
}
203 
// Vertical 8-tap high bit-depth convolution over a w x h block, producing
// 4 output rows per loop iteration via a sliding window of source rows.
static inline void highbd_convolve_y_sr_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int bd) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    // Clamp limit for bd-bit pixels.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the 7-row sliding window; each iteration loads 4 more rows and
    // produces 4 output rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
      uint16x4_t d1 =
          highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
      uint16x4_t d2 =
          highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
      uint16x4_t d3 =
          highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the row window down by 4.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Width is a multiple of 8; process one 8-wide column strip at a time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
        uint16x8_t d1 =
            highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
        uint16x8_t d2 =
            highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
        uint16x8_t d3 = highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10,
                                             y_filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the row window down by 4.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      // Advance to the next 8-wide column strip.
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
288 
// Apply a 12-tap vertical filter to 4 high bit-depth pixels. The kernel is
// split across y_filter_0_7 (taps 0-7) and y_filter_8_11 (taps 8-11).
static inline uint16x4_t highbd_convolve12_4_y(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const uint16x4_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter_0_7);

  // Widening multiply-accumulate into a 32-bit lane accumulator.
  int32x4_t acc = vmull_lane_s16(s0, taps_0_3, 0);
  acc = vmlal_lane_s16(acc, s1, taps_0_3, 1);
  acc = vmlal_lane_s16(acc, s2, taps_0_3, 2);
  acc = vmlal_lane_s16(acc, s3, taps_0_3, 3);
  acc = vmlal_lane_s16(acc, s4, taps_4_7, 0);
  acc = vmlal_lane_s16(acc, s5, taps_4_7, 1);
  acc = vmlal_lane_s16(acc, s6, taps_4_7, 2);
  acc = vmlal_lane_s16(acc, s7, taps_4_7, 3);
  acc = vmlal_lane_s16(acc, s8, y_filter_8_11, 0);
  acc = vmlal_lane_s16(acc, s9, y_filter_8_11, 1);
  acc = vmlal_lane_s16(acc, s10, y_filter_8_11, 2);
  acc = vmlal_lane_s16(acc, s11, y_filter_8_11, 3);

  // Round, narrow with unsigned saturation, then clamp to the bit-depth max.
  return vmin_u16(vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS), max);
}
315 
// Apply a 12-tap vertical filter to 8 high bit-depth pixels. The kernel is
// split across y_filter_0_7 (taps 0-7) and y_filter_8_11 (taps 8-11). The
// 8-wide input is processed as two 4-wide halves because the widening
// multiplies produce 32-bit lanes.
static inline uint16x8_t highbd_convolve12_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const uint16x8_t max) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  // Accumulate the low half of each input vector.
  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);

  // Accumulate the high half of each input vector.
  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);

  // Round, narrow with unsigned saturation, then clamp to the bit-depth max.
  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
                                vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
  return vminq_u16(res, max);
}
356 
// Vertical 12-tap high bit-depth convolution over a w x h block, producing
// 4 output rows per loop iteration via an 11-row sliding window.
static inline void highbd_convolve_y_sr_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int bd) {
  // The 12-tap kernel is split into taps 0-7 and taps 8-11.
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);

  if (w == 4) {
    // Clamp limit for bd-bit pixels.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the 11-row sliding window; each iteration loads 4 more rows and
    // produces 4 output rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &s10);
    s += 11 * src_stride;

    do {
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);

      uint16x4_t d0 =
          highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                s11, y_filter_0_7, y_filter_8_11, max);
      uint16x4_t d1 =
          highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                s12, y_filter_0_7, y_filter_8_11, max);
      uint16x4_t d2 =
          highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                s13, y_filter_0_7, y_filter_8_11, max);
      uint16x4_t d3 =
          highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                                s14, y_filter_0_7, y_filter_8_11, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the row window down by 4.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Width is a multiple of 8; process one 8-wide column strip at a time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint16x8_t d0 =
            highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                  s11, y_filter_0_7, y_filter_8_11, max);
        uint16x8_t d1 =
            highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                  s12, y_filter_0_7, y_filter_8_11, max);
        uint16x8_t d2 =
            highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                  s13, y_filter_0_7, y_filter_8_11, max);
        uint16x8_t d3 =
            highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                  s13, s14, y_filter_0_7, y_filter_8_11, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the row window down by 4.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      // Advance to the next 8-wide column strip.
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
461 
av1_highbd_convolve_y_sr_neon(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)462 void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride,
463                                    uint16_t *dst, int dst_stride, int w, int h,
464                                    const InterpFilterParams *filter_params_y,
465                                    const int subpel_y_qn, int bd) {
466   if (w == 2 || h == 2) {
467     av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
468                                filter_params_y, subpel_y_qn, bd);
469     return;
470   }
471   const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
472   const int vert_offset = filter_params_y->taps / 2 - 1;
473   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
474       filter_params_y, subpel_y_qn & SUBPEL_MASK);
475 
476   src -= vert_offset * src_stride;
477 
478   if (y_filter_taps > 8) {
479     highbd_convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
480                                     y_filter_ptr, bd);
481     return;
482   }
483   if (y_filter_taps < 8) {
484     highbd_convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h,
485                                    y_filter_ptr, bd);
486     return;
487   }
488 
489   highbd_convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h,
490                                  y_filter_ptr, bd);
491 }
492 
// Apply a 6-tap horizontal filter to 8 high bit-depth pixels. The `offset`
// accumulator seed carries the intermediate rounding bias so only one final
// rounding shift is needed (see callers).
static inline uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6],
                                              const int16x8_t x_filter,
                                              const int32x4_t offset,
                                              const uint16x8_t max) {
  // Values at indices 0 and 7 of x_filter are zero.
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  // Accumulate the low half of each input vector, starting from the offset.
  int32x4_t sum0 = offset;
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);

  // Accumulate the high half of each input vector.
  int32x4_t sum1 = offset;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);

  // Round, narrow with unsigned saturation, then clamp to the bit-depth max.
  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
                                vqrshrun_n_s32(sum1, FILTER_BITS));
  return vminq_u16(res, max);
}
521 
// Horizontal 6-tap high bit-depth convolution over a w x h block. Width is a
// multiple of 8 and height a multiple of 4; output is emitted in 8x4 tiles.
static inline void highbd_convolve_x_sr_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    int bd) {
  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  // Clamp limit for bd-bit pixels.
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  // This shim allows to do only one rounding shift instead of two.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));

  int height = h;

  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // For each of the 4 rows, load 6 vectors shifted by one sample each
      // (the "stride" of 1 steps horizontally within the row).
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset, max);
      uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset, max);
      uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset, max);
      uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset, max);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      // Advance 8 samples across the row.
      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);

    // Advance 4 rows down.
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height != 0);
}
566 
// Apply a 4-tap horizontal filter to 4 high bit-depth pixels. The `offset`
// accumulator seed carries the intermediate rounding bias so only one final
// rounding shift is needed (see callers).
static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
                                              const int16x4_t x_filter,
                                              const int32x4_t offset,
                                              const uint16x4_t max) {
  // Fold the first tap directly into the offset-seeded accumulator.
  int32x4_t acc = vmlal_lane_s16(offset, s[0], x_filter, 0);
  acc = vmlal_lane_s16(acc, s[1], x_filter, 1);
  acc = vmlal_lane_s16(acc, s[2], x_filter, 2);
  acc = vmlal_lane_s16(acc, s[3], x_filter, 3);

  // Round, narrow with unsigned saturation, then clamp to the bit-depth max.
  return vmin_u16(vqrshrun_n_s32(acc, FILTER_BITS), max);
}
580 
// Apply an 8-tap horizontal filter to 8 high bit-depth pixels. The `offset`
// accumulator seed carries the intermediate rounding bias so only one final
// rounding shift is needed (see callers).
static inline uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8],
                                              const int16x8_t x_filter,
                                              const int32x4_t offset,
                                              const uint16x8_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(x_filter);
  const int16x4_t taps_4_7 = vget_high_s16(x_filter);

  // Widening multiplies produce 32-bit lanes, so handle the 8-wide input as
  // two independent 4-wide accumulations, interleaved for pipelining.
  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s[0]), taps_0_3, 0);
  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s[0]), taps_0_3, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[1]), taps_0_3, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[1]), taps_0_3, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[2]), taps_0_3, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[2]), taps_0_3, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[3]), taps_0_3, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[3]), taps_0_3, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[4]), taps_4_7, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[4]), taps_4_7, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[5]), taps_4_7, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[5]), taps_4_7, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[6]), taps_4_7, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[6]), taps_4_7, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[7]), taps_4_7, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[7]), taps_4_7, 3);

  // Round, narrow with unsigned saturation, then clamp to the bit-depth max.
  const uint16x8_t narrowed = vcombine_u16(vqrshrun_n_s32(acc_lo, FILTER_BITS),
                                           vqrshrun_n_s32(acc_hi, FILTER_BITS));
  return vminq_u16(narrowed, max);
}
612 
// Horizontal high bit-depth convolution over a w x h block. Width 4 uses a
// 4-tap filter (the center taps of the 8-tap kernel); wider blocks use the
// full 8-tap kernel and emit 8x4 tiles.
static inline void highbd_convolve_x_sr_neon(const uint16_t *src_ptr,
                                             int src_stride, uint16_t *dst_ptr,
                                             int dst_stride, int w, int h,
                                             const int16_t *x_filter_ptr,
                                             ConvolveParams *conv_params,
                                             int bd) {
  // This shim allows to do only one rounding shift instead of two.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));

  if (w == 4) {
    // Clamp limit for bd-bit pixels.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    // 4-tap filters are used for blocks having width == 4.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    // Offset the source by 2 to line up with the center taps.
    const int16_t *s = (const int16_t *)(src_ptr + 2);
    uint16_t *d = dst_ptr;

    do {
      // For each of the 4 rows, load 4 vectors shifted by one sample each
      // (the "stride" of 1 steps horizontally within the row).
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset, max);
      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset, max);
      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset, max);
      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Advance 4 rows down.
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        // For each of the 4 rows, load 8 vectors shifted by one sample each.
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset, max);
        uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset, max);
        uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset, max);
        uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Advance 8 samples across the row.
        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      // Advance 4 rows down.
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
685 
// Horizontal 12-tap filter of 4 pixels. s[i] holds the 4 source pixels at
// horizontal offset i. The result is rounded by FILTER_BITS and clamped to
// the bit-depth maximum.
static inline uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12],
                                               const int16x8_t x_filter_0_7,
                                               const int16x4_t x_filter_8_11,
                                               const int32x4_t offset,
                                               const uint16x4_t max) {
  // Split the first eight taps into 4-lane halves so each tap is addressable
  // as a lane constant.
  const int16x4_t taps_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t taps_4_7 = vget_high_s16(x_filter_0_7);

  // Fold the rounding offset into the first multiply-accumulate.
  int32x4_t acc = vmlal_lane_s16(offset, s[0], taps_0_3, 0);
  acc = vmlal_lane_s16(acc, s[1], taps_0_3, 1);
  acc = vmlal_lane_s16(acc, s[2], taps_0_3, 2);
  acc = vmlal_lane_s16(acc, s[3], taps_0_3, 3);
  acc = vmlal_lane_s16(acc, s[4], taps_4_7, 0);
  acc = vmlal_lane_s16(acc, s[5], taps_4_7, 1);
  acc = vmlal_lane_s16(acc, s[6], taps_4_7, 2);
  acc = vmlal_lane_s16(acc, s[7], taps_4_7, 3);
  acc = vmlal_lane_s16(acc, s[8], x_filter_8_11, 0);
  acc = vmlal_lane_s16(acc, s[9], x_filter_8_11, 1);
  acc = vmlal_lane_s16(acc, s[10], x_filter_8_11, 2);
  acc = vmlal_lane_s16(acc, s[11], x_filter_8_11, 3);

  // Rounding shift with saturating narrow to u16, then clamp.
  const uint16x4_t res = vqrshrun_n_s32(acc, FILTER_BITS);
  return vmin_u16(res, max);
}
711 
// Horizontal 12-tap filter of 8 pixels, computed as two independent 4-lane
// accumulations (low and high halves of each source vector).
static inline uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12],
                                               const int16x8_t x_filter_0_7,
                                               const int16x4_t x_filter_8_11,
                                               const int32x4_t offset,
                                               const uint16x8_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t taps_4_7 = vget_high_s16(x_filter_0_7);

  // Low half: fold the rounding offset into the first multiply-accumulate.
  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s[0]), taps_0_3, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[1]), taps_0_3, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[2]), taps_0_3, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[3]), taps_0_3, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[4]), taps_4_7, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[5]), taps_4_7, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[6]), taps_4_7, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[7]), taps_4_7, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[8]), x_filter_8_11, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[9]), x_filter_8_11, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[10]), x_filter_8_11, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[11]), x_filter_8_11, 3);

  // High half.
  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s[0]), taps_0_3, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[1]), taps_0_3, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[2]), taps_0_3, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[3]), taps_0_3, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[4]), taps_4_7, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[5]), taps_4_7, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[6]), taps_4_7, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[7]), taps_4_7, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[8]), x_filter_8_11, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[9]), x_filter_8_11, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[10]), x_filter_8_11, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[11]), x_filter_8_11, 3);

  // Rounding shift + saturating narrow on each half, recombine, clamp.
  const uint16x8_t res = vcombine_u16(vqrshrun_n_s32(acc_lo, FILTER_BITS),
                                      vqrshrun_n_s32(acc_hi, FILTER_BITS));
  return vminq_u16(res, max);
}
752 
// Horizontal single-reference convolution for 12-tap filters, producing
// fully rounded pixels clamped to the bit-depth maximum.
// Processes 4 rows per loop iteration: the `h -= 4 ... while (h != 0)`
// structure requires h to be a multiple of 4, and the wide path advances 8
// columns at a time so w must be a multiple of 8 when w != 4.
static inline void highbd_convolve_x_sr_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    int bd) {
  // This shim allows to do only one rounding shift instead of two.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);

  if (w == 4) {
    // Largest pixel value representable at this bit depth, for clamping.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // sN[i] holds 4 pixels of row N starting at horizontal offset i
      // (loads use a stride of 1 to step along the row).
      int16x4_t s0[12], s1[12], s2[12], s3[12];
      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                    &s0[11]);
      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                    &s1[11]);
      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                    &s2[11]);
      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                    &s3[11]);

      uint16x4_t d0 =
          highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset, max);
      uint16x4_t d1 =
          highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset, max);
      uint16x4_t d2 =
          highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset, max);
      uint16x4_t d3 =
          highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    int height = h;

    do {
      // Inner loop walks 8 columns at a time across the row group.
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12], s1[12], s2[12], s3[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);
        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                      &s1[11]);
        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                      &s2[11]);
        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                      &s3[11]);

        uint16x8_t d0 =
            highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset, max);
        uint16x8_t d1 =
            highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset, max);
        uint16x8_t d2 =
            highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset, max);
        uint16x8_t d3 =
            highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
842 
// Public entry point for the high-bitdepth horizontal single-reference
// convolution. Dispatches to the specialised Neon kernel matching the
// effective filter length, falling back to C for 2-wide/2-tall blocks.
void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  // The Neon paths process blocks in multiples of 4; defer narrow blocks to
  // the C implementation.
  if (w == 2 || h == 2) {
    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params, bd);
    return;
  }

  const int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  // Centre the filter support on the pixel being computed.
  src -= filter_params_x->taps / 2 - 1;

  if (filter_taps > 8) {
    highbd_convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
                                    x_filter_ptr, conv_params, bd);
  } else if (filter_taps <= 6 && w != 4) {
    // The 6-tap kernel skips the two zero end taps of the 8-tap layout, so
    // its input starts one pixel further in.
    highbd_convolve_x_sr_6tap_neon(src + 1, src_stride, dst, dst_stride, w, h,
                                   x_filter_ptr, conv_params, bd);
  } else {
    highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h,
                              x_filter_ptr, conv_params, bd);
  }
}
874 
// Vertical 6-tap filter of 4 pixels for the 2D second pass.
// The 6 active taps live in lanes 1-6 of y_filter; lanes 0 and 7 are zero.
static inline uint16x4_t highbd_convolve6_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const int32x4_t round_shift,
    const int32x4_t offset, const uint16x4_t max) {
  const int16x4_t filt_lo = vget_low_s16(y_filter);
  const int16x4_t filt_hi = vget_high_s16(y_filter);

  int32x4_t acc = vmlal_lane_s16(offset, s0, filt_lo, 1);
  acc = vmlal_lane_s16(acc, s1, filt_lo, 2);
  acc = vmlal_lane_s16(acc, s2, filt_lo, 3);
  acc = vmlal_lane_s16(acc, s3, filt_hi, 0);
  acc = vmlal_lane_s16(acc, s4, filt_hi, 1);
  acc = vmlal_lane_s16(acc, s5, filt_hi, 2);

  // Apply the (negated) second-round shift, saturating-narrow to u16, and
  // clamp to the bit-depth maximum.
  const uint16x4_t res = vqmovun_s32(vshlq_s32(acc, round_shift));
  return vmin_u16(res, max);
}
895 
// Vertical 6-tap filter of 8 pixels for the 2D second pass, computed as two
// 4-lane halves. Lanes 0 and 7 of y_filter are zero.
static inline uint16x8_t highbd_convolve6_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const int32x4_t round_shift,
    const int32x4_t offset, const uint16x8_t max) {
  const int16x4_t filt_lo = vget_low_s16(y_filter);
  const int16x4_t filt_hi = vget_high_s16(y_filter);

  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s0), filt_lo, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), filt_lo, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), filt_lo, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), filt_hi, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), filt_hi, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), filt_hi, 2);

  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s0), filt_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), filt_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), filt_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), filt_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), filt_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), filt_hi, 2);

  // Apply the (negated) second-round shift to each half, then narrow with
  // saturation, recombine, and clamp.
  acc_lo = vshlq_s32(acc_lo, round_shift);
  acc_hi = vshlq_s32(acc_hi, round_shift);

  const uint16x8_t res = vcombine_u16(vqmovun_s32(acc_lo), vqmovun_s32(acc_hi));
  return vminq_u16(res, max);
}
925 
// Vertical second pass of the 2D single-reference convolution for 6-tap
// filters. Processes 4 output rows per iteration while keeping a sliding
// window of source rows in registers; requires h to be a multiple of 4 and,
// in the wide path, w to be a multiple of 8.
static inline void highbd_convolve_2d_sr_vert_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    int bd, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so vshlq_s32 in the kernels performs a right shift.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    // Largest pixel value representable at this bit depth, for clamping.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;
    // Prime the sliding window with the first 5 rows; the 6th arrives in the
    // loop below.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 = highbd_convolve6_4_2d_v(
          s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d1 = highbd_convolve6_4_2d_v(
          s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d2 = highbd_convolve6_4_2d_v(
          s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d3 = highbd_convolve6_4_2d_v(
          s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the row window down by 4.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process the block in 8-wide columns, each filtered top to bottom.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;
      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d1 =
            highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d2 =
            highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d3 =
            highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter,
                                    round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the row window down by 4.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
1012 
// Vertical 8-tap filter of 4 pixels for the 2D second pass.
static inline uint16x4_t highbd_convolve8_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter);

  int32x4_t acc = vmlal_lane_s16(offset, s0, taps_0_3, 0);
  acc = vmlal_lane_s16(acc, s1, taps_0_3, 1);
  acc = vmlal_lane_s16(acc, s2, taps_0_3, 2);
  acc = vmlal_lane_s16(acc, s3, taps_0_3, 3);
  acc = vmlal_lane_s16(acc, s4, taps_4_7, 0);
  acc = vmlal_lane_s16(acc, s5, taps_4_7, 1);
  acc = vmlal_lane_s16(acc, s6, taps_4_7, 2);
  acc = vmlal_lane_s16(acc, s7, taps_4_7, 3);

  // Apply the (negated) second-round shift, saturating-narrow to u16, and
  // clamp to the bit-depth maximum.
  const uint16x4_t res = vqmovun_s32(vshlq_s32(acc, round_shift));
  return vmin_u16(res, max);
}
1034 
// Vertical 8-tap filter of 8 pixels for the 2D second pass, computed as two
// 4-lane halves.
static inline uint16x8_t highbd_convolve8_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter);

  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s0), taps_0_3, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_0_3, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_0_3, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_0_3, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_4_7, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_4_7, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_4_7, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_4_7, 3);

  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s0), taps_0_3, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_0_3, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_0_3, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_0_3, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_4_7, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_4_7, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_4_7, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_4_7, 3);

  // Apply the (negated) second-round shift, narrow with saturation,
  // recombine, and clamp.
  acc_lo = vshlq_s32(acc_lo, round_shift);
  acc_hi = vshlq_s32(acc_hi, round_shift);

  const uint16x8_t res = vcombine_u16(vqmovun_s32(acc_lo), vqmovun_s32(acc_hi));
  return vminq_u16(res, max);
}
1067 
// Vertical second pass of the 2D single-reference convolution for 8-tap
// filters. Processes 4 output rows per iteration with a 7-row sliding window
// kept in registers; requires h to be a multiple of 4 and, in the wide path,
// w to be a multiple of 8.
static inline void highbd_convolve_2d_sr_vert_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    int bd, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so vshlq_s32 in the kernels performs a right shift.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    // Largest pixel value representable at this bit depth, for clamping.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the sliding window with the first 7 rows; the 8th arrives in the
    // loop below.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d1 =
          highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d2 =
          highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d3 =
          highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                  round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the row window down by 4.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process the block in 8-wide columns, each filtered top to bottom.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                    round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the row window down by 4.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
1164 
// Vertical 12-tap filter of 4 pixels for the 2D second pass.
static inline uint16x4_t highbd_convolve12_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t acc = vmlal_lane_s16(offset, s0, taps_0_3, 0);
  acc = vmlal_lane_s16(acc, s1, taps_0_3, 1);
  acc = vmlal_lane_s16(acc, s2, taps_0_3, 2);
  acc = vmlal_lane_s16(acc, s3, taps_0_3, 3);
  acc = vmlal_lane_s16(acc, s4, taps_4_7, 0);
  acc = vmlal_lane_s16(acc, s5, taps_4_7, 1);
  acc = vmlal_lane_s16(acc, s6, taps_4_7, 2);
  acc = vmlal_lane_s16(acc, s7, taps_4_7, 3);
  acc = vmlal_lane_s16(acc, s8, y_filter_8_11, 0);
  acc = vmlal_lane_s16(acc, s9, y_filter_8_11, 1);
  acc = vmlal_lane_s16(acc, s10, y_filter_8_11, 2);
  acc = vmlal_lane_s16(acc, s11, y_filter_8_11, 3);

  // Apply the (negated) second-round shift, saturating-narrow to u16, and
  // clamp to the bit-depth maximum.
  const uint16x4_t res = vqmovun_s32(vshlq_s32(acc, round_shift));
  return vmin_u16(res, max);
}
1192 
// Vertical 12-tap filter of 8 pixels for the 2D second pass, computed as two
// 4-lane halves.
static inline uint16x8_t highbd_convolve12_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s0), taps_0_3, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_0_3, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_0_3, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_0_3, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_4_7, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_4_7, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_4_7, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_4_7, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s8), y_filter_8_11, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s9), y_filter_8_11, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s10), y_filter_8_11, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s11), y_filter_8_11, 3);

  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s0), taps_0_3, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_0_3, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_0_3, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_0_3, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_4_7, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_4_7, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_4_7, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_4_7, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s8), y_filter_8_11, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s9), y_filter_8_11, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s10), y_filter_8_11, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s11), y_filter_8_11, 3);

  // Apply the (negated) second-round shift, narrow with saturation,
  // recombine, and clamp.
  acc_lo = vshlq_s32(acc_lo, round_shift);
  acc_hi = vshlq_s32(acc_hi, round_shift);

  const uint16x8_t res = vcombine_u16(vqmovun_s32(acc_lo), vqmovun_s32(acc_hi));
  return vminq_u16(res, max);
}
1235 
// Vertical second pass of the 2D single-reference convolution for 12-tap
// filters. Processes 4 output rows per iteration with an 11-row sliding
// window kept in registers; requires h to be a multiple of 4 and, in the
// wide path, w to be a multiple of 8.
static inline void highbd_convolve_2d_sr_vert_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    const int bd, const int offset) {
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so vshlq_s32 in the kernels performs a right shift.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    // Largest pixel value representable at this bit depth, for clamping.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the sliding window with the first 11 rows; the 12th arrives in
    // the loop below.
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &s10);
    s += 11 * src_stride;

    do {
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);

      uint16x4_t d0 = highbd_convolve12_4_2d_v(
          s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d1 = highbd_convolve12_4_2d_v(
          s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d2 = highbd_convolve12_4_2d_v(
          s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d3 = highbd_convolve12_4_2d_v(
          s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the row window down by 4.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process the block in 8-wide columns, each filtered top to bottom.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint16x8_t d0 = highbd_convolve12_8_2d_v(
            s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d1 = highbd_convolve12_8_2d_v(
            s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d2 = highbd_convolve12_8_2d_v(
            s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d3 = highbd_convolve12_8_2d_v(
            s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the row window down by 4.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
1344 
// Apply a 6-tap horizontal filter to 8 high bit-depth pixels. s[i] holds the
// 8 input samples starting at horizontal offset i. The accumulators are
// seeded with `offset` and the result is narrowed with unsigned saturation
// after the rounding right shift encoded in shift_s32 (a negative shift
// amount, i.e. a rounding shift right).
static inline uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6],
                                                 const int16x8_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  // Values at indices 0 and 7 of x_filter are zero, so only lanes 1-6 are
  // accumulated below.
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  // Multiply-accumulate the lower four lanes of each input vector.
  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);

  // Multiply-accumulate the upper four lanes.
  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);

  sum0 = vqrshlq_s32(sum0, shift_s32);
  sum1 = vqrshlq_s32(sum1, shift_s32);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}
1372 
// Horizontal pass of the high bit-depth 2D convolution for 6-tap filters.
// Filters h rows of src_ptr into the intermediate buffer dst_ptr, seeding
// each accumulator with `offset` and applying a rounding right shift by
// round_0 bits. Assumes w is a multiple of 8 — the width loops have no tail
// handling. (The caller only takes this path when w != 4.)
static inline void highbd_convolve_2d_sr_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  int height = h;

  // Process rows in groups of four until at most four remain.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // Load six horizontally-offset vectors for each of the four rows
      // (unit stride between vectors gives the successive filter positions).
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 =
          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
      uint16x8_t d1 =
          highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32);
      uint16x8_t d2 =
          highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32);
      uint16x8_t d3 =
          highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);
  // Process the remaining 1-4 rows one at a time.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 =
          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}
1443 
// Apply a 4-tap horizontal filter to 4 high bit-depth pixels. s[i] holds the
// 4 input samples starting at horizontal offset i. The accumulator is seeded
// with `offset`; shift_s32 holds a negative shift amount, so the final
// vqrshlq_s32 performs a rounding right shift before narrowing with unsigned
// saturation.
static inline uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4],
                                                 const int16x4_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  int32x4_t acc = vmlal_lane_s16(offset, s[0], x_filter, 0);
  acc = vmlal_lane_s16(acc, s[1], x_filter, 1);
  acc = vmlal_lane_s16(acc, s[2], x_filter, 2);
  acc = vmlal_lane_s16(acc, s[3], x_filter, 3);

  // Round, shift and narrow in one go.
  return vqmovun_s32(vqrshlq_s32(acc, shift_s32));
}
1456 
// Apply an 8-tap horizontal filter to 8 high bit-depth pixels. s[i] holds
// the 8 input samples starting at horizontal offset i. Both accumulators are
// seeded with `offset`; shift_s32 holds a negative shift amount, so
// vqrshlq_s32 performs a rounding right shift before the result is narrowed
// with unsigned saturation.
static inline uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8],
                                                 const int16x8_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  const int16x4_t filter_lo = vget_low_s16(x_filter);
  const int16x4_t filter_hi = vget_high_s16(x_filter);

  // Multiply-accumulate the lower four lanes of each input vector.
  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s[0]), filter_lo, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[1]), filter_lo, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[2]), filter_lo, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[3]), filter_lo, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[4]), filter_hi, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[5]), filter_hi, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[6]), filter_hi, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[7]), filter_hi, 3);

  // Multiply-accumulate the upper four lanes.
  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s[0]), filter_lo, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[1]), filter_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[2]), filter_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[3]), filter_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[4]), filter_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[5]), filter_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[6]), filter_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[7]), filter_hi, 3);

  acc_lo = vqrshlq_s32(acc_lo, shift_s32);
  acc_hi = vqrshlq_s32(acc_hi, shift_s32);

  return vcombine_u16(vqmovun_s32(acc_lo), vqmovun_s32(acc_hi));
}
1487 
// Horizontal pass of the high bit-depth 2D convolution for 8-tap filters,
// with a 4-tap special case for w == 4 blocks. Filters h rows of src_ptr
// into the intermediate buffer dst_ptr, seeding each accumulator with
// `offset` and applying a rounding right shift by round_0 bits. Widths other
// than 4 are assumed to be multiples of 8 (the width loops have no tail
// handling).
static inline void highbd_convolve_2d_sr_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
    // The non-zero taps are read from indices [2, 5] of the 8-tap kernel and
    // src_ptr is advanced by 1 to re-centre the narrower kernel.
    // NOTE(review): assumes the caller positioned src_ptr for a >= 6-tap
    // support — confirm against av1_highbd_convolve_2d_sr_neon.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
    uint16_t *d = dst_ptr;

    // Process rows in groups of four until at most four remain.
    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
      uint16x4_t d1 =
          highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32);
      uint16x4_t d2 =
          highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32);
      uint16x4_t d3 =
          highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Process the remaining 1-4 rows one at a time.
    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);

      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    // Process rows in groups of four until at most four remain.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        // Load eight horizontally-offset vectors for each of the four rows.
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Process the remaining 1-4 rows one at a time.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}
1603 
// Apply a 12-tap horizontal filter to 4 high bit-depth pixels. s[i] holds
// the 4 input samples starting at horizontal offset i. The accumulator is
// seeded with `offset`; shift_s32 holds a negative shift amount, so the
// final vqrshlq_s32 performs a rounding right shift before narrowing with
// unsigned saturation.
static inline uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12],
                                                  const int16x8_t x_filter_0_7,
                                                  const int16x4_t x_filter_8_11,
                                                  const int32x4_t shift_s32,
                                                  const int32x4_t offset) {
  const int16x4_t taps_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t taps_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t acc = vmlal_lane_s16(offset, s[0], taps_0_3, 0);
  acc = vmlal_lane_s16(acc, s[1], taps_0_3, 1);
  acc = vmlal_lane_s16(acc, s[2], taps_0_3, 2);
  acc = vmlal_lane_s16(acc, s[3], taps_0_3, 3);
  acc = vmlal_lane_s16(acc, s[4], taps_4_7, 0);
  acc = vmlal_lane_s16(acc, s[5], taps_4_7, 1);
  acc = vmlal_lane_s16(acc, s[6], taps_4_7, 2);
  acc = vmlal_lane_s16(acc, s[7], taps_4_7, 3);
  acc = vmlal_lane_s16(acc, s[8], x_filter_8_11, 0);
  acc = vmlal_lane_s16(acc, s[9], x_filter_8_11, 1);
  acc = vmlal_lane_s16(acc, s[10], x_filter_8_11, 2);
  acc = vmlal_lane_s16(acc, s[11], x_filter_8_11, 3);

  // Round, shift and narrow in one go.
  return vqmovun_s32(vqrshlq_s32(acc, shift_s32));
}
1628 
// Apply a 12-tap horizontal filter to 8 high bit-depth pixels. s[i] holds
// the 8 input samples starting at horizontal offset i. Both accumulators are
// seeded with `offset` and the result is narrowed with unsigned saturation
// after the rounding right shift encoded in shift_s32 (a negative shift
// amount).
static inline uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12],
                                                  const int16x8_t x_filter_0_7,
                                                  const int16x4_t x_filter_8_11,
                                                  const int32x4_t shift_s32,
                                                  const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  // Multiply-accumulate the lower four lanes of each input vector.
  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3);

  // Multiply-accumulate the upper four lanes.
  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3);

  sum0 = vqrshlq_s32(sum0, shift_s32);
  sum1 = vqrshlq_s32(sum1, shift_s32);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}
1668 
// Horizontal pass of the high bit-depth 2D convolution for 12-tap filters.
// Filters h rows of src_ptr into the intermediate buffer dst_ptr, seeding
// each accumulator with `offset` and applying a rounding right shift by
// round_0 bits. Widths other than 4 are assumed to be multiples of 8 (the
// width loops have no tail handling).
static inline void highbd_convolve_2d_sr_horiz_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Process rows in groups of four until at most four remain.
    do {
      // Load twelve horizontally-offset vectors for each of the four rows.
      int16x4_t s0[12], s1[12], s2[12], s3[12];
      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                    &s0[11]);
      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                    &s1[11]);
      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                    &s2[11]);
      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                    &s3[11]);

      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d2 = highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Process the remaining 1-4 rows one at a time.
    do {
      int16x4_t s0[12];
      load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5],
                    &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]);

      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);

      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    int height = h;

    // Process rows in groups of four until at most four remain.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12], s1[12], s2[12], s3[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);
        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                      &s1[11]);
        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                      &s2[11]);
        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                      &s3[11]);

        uint16x8_t d0 = highbd_convolve12_8_2d_h(
            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d1 = highbd_convolve12_8_2d_h(
            s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d2 = highbd_convolve12_8_2d_h(
            s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d3 = highbd_convolve12_8_2d_h(
            s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Process the remaining 1-4 rows one at a time.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);

        uint16x8_t d0 = highbd_convolve12_8_2d_h(
            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
        // Use `!= 0` for consistency with every other width loop in this
        // file; width is always a positive multiple of 8 here.
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}
1797 
// High bit-depth single-reference 2D (horizontal then vertical) convolution.
// Filters the block through an intermediate buffer, dispatching to
// tap-count-specialised horizontal and vertical kernels.
void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn,
                                    const int subpel_y_qn,
                                    ConvolveParams *conv_params, int bd) {
  // Blocks with a dimension of 2 are too small for the SIMD paths below;
  // defer to the C implementation.
  if (w == 2 || h == 2) {
    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params, bd);
    return;
  }
  // Intermediate buffer for the horizontally-filtered rows.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  // Tap counts below 6 are clamped to 6 when computing the source offsets.
  // NOTE(review): presumably the shorter kernels are stored centred within
  // the filter arrays so the 6-tap alignment holds — confirm.
  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
  // The horizontal pass produces clamped_y_taps - 1 extra rows for the
  // vertical pass.
  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  // NOTE(review): x_offset_initial seeds the horizontal accumulators —
  // presumably keeping intermediate sums non-negative; confirm.
  const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
  // simple shift left instead of a rounding saturating shift left.
  const int y_offset =
      (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));

  // Step back to the top-left sample covered by the filter supports.
  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // 12-tap horizontal filters are paired with the 12-tap vertical kernel.
  if (x_filter_taps > 8) {
    highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block,
                                           im_stride, w, im_h, x_filter_ptr,
                                           conv_params, x_offset_initial);

    highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride,
                                          w, h, y_filter_ptr, conv_params, bd,
                                          y_offset);
    return;
  }
  // The 6-tap horizontal kernel only handles widths that are multiples of 8;
  // w == 4 blocks take the generic path, which has a 4-tap special case.
  if (x_filter_taps <= 6 && w != 4) {
    highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block,
                                          im_stride, w, im_h, x_filter_ptr,
                                          conv_params, x_offset_initial);
  } else {
    highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride,
                                     w, im_h, x_filter_ptr, conv_params,
                                     x_offset_initial);
  }

  // Vertical pass: choose the 6-tap or 8-tap kernel by tap count.
  if (y_filter_taps <= 6) {
    highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, y_filter_ptr, conv_params, bd,
                                         y_offset);
  } else {
    highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, y_filter_ptr, conv_params, bd,
                                         y_offset);
  }
}
1866 
1867 // Filter used is [64, 64].
// Intra block copy horizontal filter. The only filter ever used here is the
// 2-tap bilinear kernel [64, 64], so each output pixel reduces to the
// rounding average of a pixel and its right-hand neighbour.
void av1_highbd_convolve_x_sr_intrabc_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params, int bd) {
  assert(subpel_x_qn == 8);
  assert(filter_params_x->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;
  (void)bd;

  if (w <= 4) {
    do {
      uint16x4_t left = vld1_u16(src);
      uint16x4_t right = vld1_u16(src + 1);

      // Rounding halving add computes (left + right + 1) >> 1, which is the
      // [64, 64] filter after the combined rounding shifts.
      uint16x4_t avg = vrhadd_u16(left, right);

      if (w == 2) {
        store_u16_2x1(dst, avg);
      } else {
        vst1_u16(dst, avg);
      }

      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    do {
      const uint16_t *s = src;
      uint16_t *d = dst;
      int remaining = w;

      do {
        uint16x8_t left = vld1q_u16(s);
        uint16x8_t right = vld1q_u16(s + 1);

        vst1q_u16(d, vrhaddq_u16(left, right));

        s += 8;
        d += 8;
        remaining -= 8;
      } while (remaining != 0);
      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  }
}
1919 
1920 // Filter used is [64, 64].
// Intra block copy vertical filter. The only filter ever used here is the
// 2-tap bilinear kernel [64, 64], so each output pixel reduces to the
// rounding average of a pixel and the pixel directly below it.
void av1_highbd_convolve_y_sr_intrabc_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    int bd) {
  assert(subpel_y_qn == 8);
  assert(filter_params_y->taps == 2);
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)bd;

  if (w <= 4) {
    do {
      uint16x4_t row0 = vld1_u16(src);
      uint16x4_t row1 = vld1_u16(src + src_stride);

      // Rounding halving add computes (row0 + row1 + 1) >> 1.
      uint16x4_t avg = vrhadd_u16(row0, row1);

      if (w == 2) {
        store_u16_2x1(dst, avg);
      } else {
        vst1_u16(dst, avg);
      }

      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    // Walk the block in 8-pixel-wide columns, filtering a full column of
    // rows before advancing.
    do {
      const uint16_t *s = src;
      uint16_t *d = dst;
      int rows = h;

      do {
        uint16x8_t row0 = vld1q_u16(s);
        uint16x8_t row1 = vld1q_u16(s + src_stride);

        vst1q_u16(d, vrhaddq_u16(row0, row1));

        s += src_stride;
        d += dst_stride;
      } while (--rows != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}
1970 
// Both horizontal and vertical passes use the same 2-tap filter: [64, 64].
void av1_highbd_convolve_2d_sr_intrabc_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  // IntraBC 2D filtering is the half-pel [64, 64] filter in both directions,
  // i.e. a rounding average over a 2x2 pixel window.
  assert(subpel_x_qn == 8);
  assert(subpel_y_qn == 8);
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)conv_params;
  (void)bd;

  // Intermediate buffer holding the raw horizontal sums (pairs of adjacent
  // pixels added, no scaling). The vertical pass needs one extra row.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
  const int im_stride = MAX_SB_SIZE;
  int im_h = h + 1;

  // Rounding offset folded into the second halving add of the vertical pass.
  const uint16x8_t vert_offset = vdupq_n_u16(1);

  uint16_t *im = im_block;

  // Horizontal pass: store p[x] + p[x + 1] for every row of the extended
  // block.
  if (w <= 4) {
    do {
      uint16x4_t p0 = vld1_u16(src);
      uint16x4_t p1 = vld1_u16(src + 1);

      // Safe to store the whole vector, the im buffer is big enough.
      vst1_u16(im, vadd_u16(p0, p1));

      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  } else {
    do {
      const uint16_t *s = src;
      uint16_t *im_row = im;
      int width = w;

      do {
        uint16x8_t p0 = vld1q_u16(s);
        uint16x8_t p1 = vld1q_u16(s + 1);

        vst1q_u16(im_row, vaddq_u16(p0, p1));

        s += 8;
        im_row += 8;
        width -= 8;
      } while (width != 0);

      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  }

  im = im_block;

  // Vertical pass: two halving adds compute ((r0 + r1) >> 1 + 1) >> 1, which
  // equals the rounded 2x2 sum (sum + 2) >> 2 without intermediate overflow.
  if (w <= 4) {
    do {
      uint16x4_t r0 = vld1_u16(im);
      uint16x4_t r1 = vld1_u16(im + im_stride);

      uint16x4_t d0 = vhadd_u16(r0, r1);
      d0 = vhadd_u16(d0, vget_low_u16(vert_offset));

      if (w == 2) {
        store_u16_2x1(dst, d0);
      } else {
        vst1_u16(dst, d0);
      }

      im += im_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    do {
      uint16_t *im_row = im;
      uint16_t *d = dst;
      int rows = h;

      do {
        uint16x8_t r0 = vld1q_u16(im_row);
        uint16x8_t r1 = vld1q_u16(im_row + im_stride);

        vst1q_u16(d, vhaddq_u16(vhaddq_u16(r0, r1), vert_offset));

        im_row += im_stride;
        d += dst_stride;
      } while (--rows != 0);

      im += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}
2079