/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_
#define AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_

#include <assert.h>
#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"

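// Vertical (second) pass helpers for the scaled convolution paths. The input
// is the int16_t intermediate block produced by the horizontal pass. The
// compound_* variants write to, or combine with, the 16-bit compound
// prediction buffer, while the plain variants narrow directly to 8-bit pixels.
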
static inline int16x4_t compound_convolve8_4_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
    const int32x4_t offset_const) {
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum = offset_const;
  sum = vmlal_lane_s16(sum, s0, filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);

  return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS);
}

static inline int16x8_t compound_convolve8_8_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
    const int32x4_t offset_const) {
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum0 = offset_const;
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);

  int32x4_t sum1 = offset_const;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);

  int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS);
  int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS);

  return vcombine_s16(res0, res1);
}

static inline void compound_convolve_vert_scale_8tap_neon(
    const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
  // non-rounding shifts - which are generally faster than rounding shifts on
  // modern CPUs.
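  // With the usual 8-bit configuration (assuming FILTER_BITS == 7,
  // ROUND0_BITS == 3 and COMPOUND_ROUND1_BITS == 7), offset_bits is
  // 8 + 14 - 3 = 19, so the constant below is (1 << 19) + (1 << 6).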
  const int32x4_t vert_offset =
      vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));

  int y_qn = subpel_y_qn;

  if (w == 4) {
    do {
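      // y_qn is the source row position in Q(SCALE_SUBPEL_BITS) fixed-point
      // format: the integer part selects the starting row, while the
      // fractional part, shifted down by SCALE_EXTRA_BITS, selects one of the
      // precomputed filter phases (each SUBPEL_TAPS coefficients wide).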
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
      load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

      int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                            filter, vert_offset);

      vst1_u16(dst, vreinterpret_u16_s16(d0));

      dst += dst_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  } else {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int width = w;
      uint16_t *d = dst;

      do {
        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
        load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

        int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                              filter, vert_offset);

        vst1q_u16(d, vreinterpretq_u16_s16(d0));

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);

      dst += dst_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  }
}

static inline void compound_avg_convolve_vert_scale_8tap_neon(
    const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
    uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
    int subpel_y_qn, int y_step_qn) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
  // non-rounding shifts - which are generally faster than rounding shifts
  // on modern CPUs.
  const int32_t vert_offset_bits =
      (1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1));
  // For the averaging code path, subtract round offset and convolve round.
  const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits);
  const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits);
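  // Folding avg_offset_bits into the accumulator means the halving add with
  // the values loaded from dst16 below already accounts for the compound
  // round offset, so a single saturating narrowing shift then produces the
  // final 8-bit pixel.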

  int y_qn = subpel_y_qn;

  if (w == 4) {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
      load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

      int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                            filter, vert_offset);

      int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));

      int16x4_t avg = vhadd_s16(dd0, d0);
      int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0));

      uint8x8_t d0_u8 = vqrshrun_n_s16(
          d0_s16, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);

      store_u8_4x1(dst8, d0_u8);

      dst16 += dst16_stride;
      dst8 += dst8_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  } else {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int width = w;
      uint8_t *dst8_ptr = dst8;
      uint16_t *dst16_ptr = dst16;

      do {
        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
        load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

        int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                              filter, vert_offset);

        int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));

        int16x8_t avg = vhaddq_s16(dd0, d0);

        uint8x8_t d0_u8 = vqrshrun_n_s16(
            avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);

        vst1_u8(dst8_ptr, d0_u8);

        s += 8;
        dst8_ptr += 8;
        dst16_ptr += 8;
        width -= 8;
      } while (width != 0);

      dst16 += dst16_stride;
      dst8 += dst8_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  }
}

static inline void compound_dist_wtd_convolve_vert_scale_8tap_neon(
    const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
    uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
    ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  int y_qn = subpel_y_qn;
  // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
  // non-rounding shifts - which are generally faster than rounding shifts on
  // modern CPUs.
  const int32x4_t vert_offset =
      vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
  // For the weighted averaging code path we have to subtract round offset and
  // convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS -
  // COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The
  // additional shift by DIST_PRECISION_BITS is needed in order to merge two
  // shift calculations into one.
  const int32x4_t dist_wtd_offset = vdupq_n_s32(
      (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 +
             DIST_PRECISION_BITS)) -
      (1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) -
      (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS)));
  const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset);
  const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset);
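  // fwd_offset and bck_offset are the distance-based compound weights; they
  // sum to (1 << DIST_PRECISION_BITS), which is why DIST_PRECISION_BITS also
  // appears in the final narrowing shift below.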

  if (w == 4) {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
      load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

      int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                            filter, vert_offset);

      int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));

      int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0);
      dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0);

      int16x4_t d0_s16 = vshrn_n_s32(
          dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
                           DIST_PRECISION_BITS);

      uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0)));

      store_u8_4x1(dst8, d0_u8);

      dst16 += dst16_stride;
      dst8 += dst8_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  } else {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int width = w;
      uint8_t *dst8_ptr = dst8;
      uint16_t *dst16_ptr = dst16;

      do {
        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
        load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

        int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                              filter, vert_offset);

        int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));

        int32x4_t dst_wtd_avg0 =
            vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0));
        int32x4_t dst_wtd_avg1 =
            vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0));

        dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0));
        dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0));

        int16x4_t d0_s16_0 = vshrn_n_s32(
            dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
                              DIST_PRECISION_BITS);
        int16x4_t d0_s16_1 = vshrn_n_s32(
            dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
                              DIST_PRECISION_BITS);

        uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1));

        vst1_u8(dst8_ptr, d0_u8);

        s += 8;
        dst8_ptr += 8;
        dst16_ptr += 8;
        width -= 8;
      } while (width != 0);

      dst16 += dst16_stride;
      dst8 += dst8_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  }
}

static inline uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x4_t s6, const int16x4_t s7,
                                      const int16x8_t filter,
                                      const int32x4_t offset_const) {
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum = offset_const;
  sum = vmlal_lane_s16(sum, s0, filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);

  int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);

  return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0)));
}

static inline uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t s6, const int16x8_t s7,
                                      const int16x8_t filter,
                                      const int32x4_t offset_const) {
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum0 = offset_const;
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);

  int32x4_t sum1 = offset_const;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);

  int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS);
  int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS);

  return vqmovun_s16(vcombine_s16(res0, res1));
}

static inline void convolve_vert_scale_8tap_neon(
    const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int round_1 = 2 * FILTER_BITS - ROUND0_BITS;
  // The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts.
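  // Subtracting 1 << (offset_bits - 1) is intended to remove the bias carried
  // over from the horizontal pass intermediates once they have been weighted
  // by the vertical filter taps, so the shift by round_1 inside the convolve
  // helpers leaves a value in the normal pixel range before saturating to
  // 8 bits.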
  int32x4_t vert_offset =
      vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1)));

  int y_qn = subpel_y_qn;
  if (w == 4) {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
      load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

      uint8x8_t d =
          convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset);

      store_u8_4x1(dst, d);

      dst += dst_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  } else if (w == 8) {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
      load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

      uint8x8_t d =
          convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset);

      vst1_u8(dst, d);

      dst += dst_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  } else {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      uint8_t *d = dst;
      int width = w;

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      do {
        int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
        load_s16_8x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0],
                     &s5[0], &s6[0], &s7[0]);
        load_s16_8x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1],
                     &s5[1], &s6[1], &s7[1]);

        uint8x8_t d0 = convolve8_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
                                     s6[0], s7[0], filter, vert_offset);
        uint8x8_t d1 = convolve8_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
                                     s6[1], s7[1], filter, vert_offset);

        vst1q_u8(d, vcombine_u8(d0, d1));

        s += 16;
        d += 16;
        width -= 16;
      } while (width != 0);

      dst += dst_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  }
}

static inline int16x4_t compound_convolve6_4_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t filter, const int32x4_t offset_const) {
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum = offset_const;
  // Filter values at indices 0 and 7 are 0.
  sum = vmlal_lane_s16(sum, s0, filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);

  return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS);
}

static inline int16x8_t compound_convolve6_8_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t filter, const int32x4_t offset_const) {
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum0 = offset_const;
  // Filter values at indices 0 and 7 are 0.
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);

  int32x4_t sum1 = offset_const;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);

  int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS);
  int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS);

  return vcombine_s16(res0, res1);
}

static inline void compound_convolve_vert_scale_6tap_neon(
    const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
  // non-rounding shifts - which are generally faster than rounding shifts on
  // modern CPUs.
  const int32x4_t vert_offset =
      vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));

  int y_qn = subpel_y_qn;

  if (w == 4) {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int16x4_t s0, s1, s2, s3, s4, s5;
      load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);

      int16x4_t d0 =
          compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);

      vst1_u16(dst, vreinterpret_u16_s16(d0));

      dst += dst_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  } else {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int width = w;
      uint16_t *d = dst;

      do {
        int16x8_t s0, s1, s2, s3, s4, s5;
        load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);

        int16x8_t d0 =
            compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);

        vst1q_u16(d, vreinterpretq_u16_s16(d0));

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);

      dst += dst_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  }
}

static inline void compound_avg_convolve_vert_scale_6tap_neon(
    const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
    uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
    int subpel_y_qn, int y_step_qn) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
  // non-rounding shifts - which are generally faster than rounding shifts
  // on modern CPUs.
  const int32_t vert_offset_bits =
      (1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1));
  // For the averaging code path, subtract round offset and convolve round.
  const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits);
  const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits);

  int y_qn = subpel_y_qn;

  if (w == 4) {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int16x4_t s0, s1, s2, s3, s4, s5;
      load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);

      int16x4_t d0 =
          compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);

      int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));

      int16x4_t avg = vhadd_s16(dd0, d0);
      int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0));

      uint8x8_t d0_u8 = vqrshrun_n_s16(
          d0_s16, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);

      store_u8_4x1(dst8, d0_u8);

      dst16 += dst16_stride;
      dst8 += dst8_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  } else {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int width = w;
      uint8_t *dst8_ptr = dst8;
      uint16_t *dst16_ptr = dst16;

      do {
        int16x8_t s0, s1, s2, s3, s4, s5;
        load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);

        int16x8_t d0 =
            compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);

        int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));

        int16x8_t avg = vhaddq_s16(dd0, d0);

        uint8x8_t d0_u8 = vqrshrun_n_s16(
            avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);

        vst1_u8(dst8_ptr, d0_u8);

        s += 8;
        dst8_ptr += 8;
        dst16_ptr += 8;
        width -= 8;
      } while (width != 0);

      dst16 += dst16_stride;
      dst8 += dst8_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  }
}

static inline void compound_dist_wtd_convolve_vert_scale_6tap_neon(
    const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
    uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
    ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  int y_qn = subpel_y_qn;
  // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
  // non-rounding shifts - which are generally faster than rounding shifts on
  // modern CPUs.
  const int32x4_t vert_offset =
      vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
  // For the weighted averaging code path we have to subtract round offset and
  // convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS -
  // COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The
  // additional shift by DIST_PRECISION_BITS is needed in order to merge two
  // shift calculations into one.
  const int32x4_t dist_wtd_offset = vdupq_n_s32(
      (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 +
             DIST_PRECISION_BITS)) -
      (1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) -
      (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS)));
  const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset);
  const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset);

  if (w == 4) {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int16x4_t s0, s1, s2, s3, s4, s5;
      load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);

      int16x4_t d0 =
          compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);

      int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));

      int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0);
      dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0);

      int16x4_t d0_s16 = vshrn_n_s32(
          dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
                           DIST_PRECISION_BITS);

      uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0)));

      store_u8_4x1(dst8, d0_u8);

      dst16 += dst16_stride;
      dst8 += dst8_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  } else {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int width = w;
      uint8_t *dst8_ptr = dst8;
      uint16_t *dst16_ptr = dst16;

      do {
        int16x8_t s0, s1, s2, s3, s4, s5;
        load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);

        int16x8_t d0 =
            compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);

        int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));

        int32x4_t dst_wtd_avg0 =
            vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0));
        int32x4_t dst_wtd_avg1 =
            vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0));

        dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0));
        dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0));

        int16x4_t d0_s16_0 = vshrn_n_s32(
            dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
                              DIST_PRECISION_BITS);
        int16x4_t d0_s16_1 = vshrn_n_s32(
            dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
                              DIST_PRECISION_BITS);

        uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1));

        vst1_u8(dst8_ptr, d0_u8);

        s += 8;
        dst8_ptr += 8;
        dst16_ptr += 8;
        width -= 8;
      } while (width != 0);

      dst16 += dst16_stride;
      dst8 += dst8_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  }
}

static inline uint8x8_t convolve6_4_v(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x8_t filter,
                                      const int32x4_t offset_const) {
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum = offset_const;
  // Filter values at indices 0 and 7 are 0.
  sum = vmlal_lane_s16(sum, s0, filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);

  int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);

  return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0)));
}

static inline uint8x8_t convolve6_8_v(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t filter,
                                      const int32x4_t offset_const) {
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum0 = offset_const;
  // Filter values at indices 0 and 7 are 0.
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);

  int32x4_t sum1 = offset_const;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);

  int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS);
  int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS);

  return vqmovun_s16(vcombine_s16(res0, res1));
}

static inline void convolve_vert_scale_6tap_neon(
    const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int round_1 = 2 * FILTER_BITS - ROUND0_BITS;
  // The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts.
  int32x4_t vert_offset =
      vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1)));

  int y_qn = subpel_y_qn;
  if (w == 4) {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int16x4_t s0, s1, s2, s3, s4, s5;
      load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);

      uint8x8_t d = convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);

      store_u8_4x1(dst, d);

      dst += dst_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  } else if (w == 8) {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      int16x8_t s0, s1, s2, s3, s4, s5;
      load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);

      uint8x8_t d = convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);

      vst1_u8(dst, d);

      dst += dst_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  } else {
    do {
      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      uint8_t *d = dst;
      int width = w;

      const ptrdiff_t filter_offset =
          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);

      do {
        int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2];
        load_s16_8x6(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0],
                     &s5[0]);
        load_s16_8x6(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1],
                     &s5[1]);

        uint8x8_t d0 = convolve6_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
                                     filter, vert_offset);
        uint8x8_t d1 = convolve6_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
                                     filter, vert_offset);

        vst1q_u8(d, vcombine_u8(d0, d1));

        s += 16;
        d += 16;
        width -= 16;
      } while (width != 0);

      dst += dst_stride;
      y_qn += y_step_qn;
    } while (--h != 0);
  }
}

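// Usage sketch (illustrative only): a vertical-pass dispatcher, such as the
// one in convolve_scale_neon.c, would typically select between the 6-tap and
// 8-tap helpers based on the filter length and the compound mode, e.g.:
//
//   if (filter_taps <= 6) {
//     if (is_compound) {
//       compound_convolve_vert_scale_6tap_neon(im_block, im_stride, dst16,
//                                              dst16_stride, w, h, y_filter,
//                                              subpel_y_qn, y_step_qn);
//     } else {
//       convolve_vert_scale_6tap_neon(im_block, im_stride, dst8, dst8_stride,
//                                     w, h, y_filter, subpel_y_qn, y_step_qn);
//     }
//   } else {
//     // ... 8-tap equivalents.
//   }
//
// The names im_block, im_stride, dst8, dst16, filter_taps and is_compound are
// placeholders for the caller's own state, not symbols defined here.
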
#endif  // AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_