1 /*
2 *
3 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
4 *
5 * This source code is subject to the terms of the BSD 2 Clause License and
6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7 * was not distributed with this source code in the LICENSE file, you can
8 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
9 * Media Patent License 1.0 was not distributed with this source code in the
10 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
11 */
12
13 #include <assert.h>
14 #include <arm_neon.h>
15
16 #include "config/aom_config.h"
17 #include "config/av1_rtcd.h"
18
19 #include "aom_dsp/aom_dsp_common.h"
20 #include "aom_dsp/arm/mem_neon.h"
21 #include "aom_dsp/arm/transpose_neon.h"
22 #include "aom_ports/mem.h"
23 #include "av1/common/convolve.h"
24 #include "av1/common/filter.h"
25 #include "av1/common/arm/convolve_neon.h"
26
27 static inline int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1,
28 const int16x4_t s2, const int16x4_t s3,
29 const int16x4_t s4, const int16x4_t s5,
30 const int16x4_t s6, const int16x4_t s7,
31 const int16x4_t s8, const int16x4_t s9,
32 const int16x4_t s10, const int16x4_t s11,
33 const int16x8_t x_filter_0_7,
34 const int16x4_t x_filter_8_11,
35 const int32x4_t horiz_const) {
36 const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
37 const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
38
39 int32x4_t sum = horiz_const;
40 sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
41 sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
42 sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
43 sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
44 sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
45 sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
46 sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
47 sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
48 sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
49 sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
50 sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
51 sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);
52
53 return vqrshrn_n_s32(sum, FILTER_BITS);
54 }
55
56 static inline void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
57 int src_stride, uint8_t *dst_ptr,
58 const int dst_stride, int w, int h,
59 const int16_t *x_filter_ptr) {
60 const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
61 const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
62
63 // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
64 // shift by FILTER_BITS - instead of a first rounding right shift by
65 // ROUND0_BITS, followed by a second rounding right shift by FILTER_BITS -
66 // ROUND0_BITS.
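// As a worked illustration (assuming the usual 8-bit configuration where
// FILTER_BITS == 7 and ROUND0_BITS == 3): the reference two-stage rounding
//   tmp = (sum + 4) >> 3;  res = (tmp + 8) >> 4;
// is identical to the single rounding
//   res = (sum + 4 + 64) >> 7;
// vqrshrn_n_s32(sum, FILTER_BITS) below supplies the "+ 64" rounding term, so
// only the "+ 4" (i.e. 1 << (ROUND0_BITS - 1)) shim has to be folded into the
// accumulator here.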
67 const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
68
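// The AArch64 path below batches four rows at a time using transposes, which
// keeps a large number of vectors live; AArch64 provides 32 Neon registers
// versus 16 on 32-bit Arm, which is presumably why the !AOM_ARCH_AARCH64 path
// sticks to a simpler one-row-at-a-time scheme.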
69 #if AOM_ARCH_AARCH64
70 do {
71 const uint8_t *s = src_ptr;
72 uint8_t *d = dst_ptr;
73 int width = w;
74
75 uint8x8_t t0, t1, t2, t3;
76 load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
77 transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
78
79 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
80 int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
81 int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
82 int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
83 int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
84 int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
85 int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
86 int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
87
88 load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
89 transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
90
91 int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
92 int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
93 int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
94
95 s += 11;
96
97 do {
98 load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
99 transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
100
101 int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
102 int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
103 int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
104 int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
105
106 int16x4_t d0 =
107 convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
108 x_filter_0_7, x_filter_8_11, horiz_const);
109 int16x4_t d1 =
110 convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
111 x_filter_0_7, x_filter_8_11, horiz_const);
112 int16x4_t d2 =
113 convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
114 x_filter_0_7, x_filter_8_11, horiz_const);
115 int16x4_t d3 =
116 convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
117 x_filter_0_7, x_filter_8_11, horiz_const);
118
119 transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
120
121 uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
122 uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
123
124 store_u8x4_strided_x2(d, dst_stride, d01);
125 store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23);
126
127 s0 = s4;
128 s1 = s5;
129 s2 = s6;
130 s3 = s7;
131 s4 = s8;
132 s5 = s9;
133 s6 = s10;
134 s7 = s11;
135 s8 = s12;
136 s9 = s13;
137 s10 = s14;
138 s += 4;
139 d += 4;
140 width -= 4;
141 } while (width != 0);
142 src_ptr += 4 * src_stride;
143 dst_ptr += 4 * dst_stride;
144 h -= 4;
145 } while (h != 0);
146
147 #else // !AOM_ARCH_AARCH64
148 do {
149 const uint8_t *s = src_ptr;
150 uint8_t *d = dst_ptr;
151 int width = w;
152
153 do {
154 uint8x16_t t0 = vld1q_u8(s);
155 int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
156 int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
157
158 int16x4_t s0 = vget_low_s16(tt0);
159 int16x4_t s4 = vget_high_s16(tt0);
160 int16x4_t s8 = vget_low_s16(tt8);
161 int16x4_t s12 = vget_high_s16(tt8);
162
163 int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
164 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
165 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
166 int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
167 int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
168 int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
169 int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12
170 int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13
171 int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14
172
173 int16x4_t d0 =
174 convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
175 x_filter_0_7, x_filter_8_11, horiz_const);
176
177 uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0)));
178
179 store_u8_4x1(d, dd0);
180
181 s += 4;
182 d += 4;
183 width -= 4;
184 } while (width != 0);
185 src_ptr += src_stride;
186 dst_ptr += dst_stride;
187 } while (--h != 0);
188 #endif // AOM_ARCH_AARCH64
189 }
190
191 static inline uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1,
192 const int16x8_t s2, const int16x8_t s3,
193 const int16x4_t filter,
194 int16x8_t horiz_const) {
195 int16x8_t sum = horiz_const;
196 sum = vmlaq_lane_s16(sum, s0, filter, 0);
197 sum = vmlaq_lane_s16(sum, s1, filter, 1);
198 sum = vmlaq_lane_s16(sum, s2, filter, 2);
199 sum = vmlaq_lane_s16(sum, s3, filter, 3);
200 // We halved the filter values so -1 from right shift.
201 return vqrshrun_n_s16(sum, FILTER_BITS - 1);
202 }
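// For reference, a scalar sketch of what each lane of convolve4_8_x above
// computes, assuming the usual FILTER_BITS == 7 / ROUND0_BITS == 3
// configuration and coefficients that the caller has already halved:
//
//   int32_t acc = 1 << (ROUND0_BITS - 2);                // == horiz_const
//   for (int k = 0; k < 4; ++k) acc += filter[k] * src[k];
//   uint8_t out = clip_pixel((acc + (1 << (FILTER_BITS - 2)))
//                            >> (FILTER_BITS - 1));      // vqrshrun_n_s16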
203
204 static inline void convolve_x_sr_4tap_neon(const uint8_t *src_ptr,
205 int src_stride, uint8_t *dst_ptr,
206 const int dst_stride, int w, int h,
207 const int16_t *x_filter_ptr) {
208 // All filter values are even, halve to reduce intermediate precision
209 // requirements.
210 const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
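// The "+ 2" above assumes the layout used by av1_get_interp_filter_subpel_kernel
// for the short filters: 4-tap kernels are stored in 8-element arrays with
// zeros in taps 0, 1, 6 and 7, so the useful coefficients sit at indices 2..5.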
211
212 // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
213 // rounding right shift by FILTER_BITS - instead of a first rounding right
214 // shift by ROUND0_BITS, followed by a second rounding right shift by
215 // FILTER_BITS - ROUND0_BITS.
216 // The outermost -1 is needed because we will halve the filter values.
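// With the assumed ROUND0_BITS == 3 the shim is therefore 1 << 1 == 2, i.e.
// the usual 1 << (ROUND0_BITS - 1) == 4 rounding offset scaled by the same
// factor of 1/2 as the coefficients; the rounding added by
// vqrshrun_n_s16(sum, FILTER_BITS - 1) in convolve4_8_x is likewise halved
// (32 instead of 64).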
217 const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
218
219 if (w == 4) {
220 do {
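// Each load_unaligned_u8(src_ptr + k, src_stride) packs four pixels from row
// 0 and four pixels from row 1 (both starting at column k) into a single
// 8-lane vector, so one convolve4_8_x call produces a full 4x2 output block,
// which store_u8x4_strided_x2 then scatters back to the two rows.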
221 uint8x8_t t01[4];
222 t01[0] = load_unaligned_u8(src_ptr + 0, src_stride);
223 t01[1] = load_unaligned_u8(src_ptr + 1, src_stride);
224 t01[2] = load_unaligned_u8(src_ptr + 2, src_stride);
225 t01[3] = load_unaligned_u8(src_ptr + 3, src_stride);
226
227 int16x8_t s01[4];
228 s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
229 s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
230 s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
231 s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
232
233 uint8x8_t d01 =
234 convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
235
236 store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
237
238 src_ptr += 2 * src_stride;
239 dst_ptr += 2 * dst_stride;
240 h -= 2;
241 } while (h != 0);
242 } else {
243 do {
244 int width = w;
245 const uint8_t *s = src_ptr;
246 uint8_t *d = dst_ptr;
247
248 do {
249 uint8x8_t t0[4], t1[4];
250 load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
251 load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
252
253 int16x8_t s0[4], s1[4];
254 s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
255 s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
256 s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
257 s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
258
259 s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
260 s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
261 s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
262 s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
263
264 uint8x8_t d0 =
265 convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
266 uint8x8_t d1 =
267 convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
268
269 store_u8_8x2(d, dst_stride, d0, d1);
270
271 s += 8;
272 d += 8;
273 width -= 8;
274 } while (width != 0);
275 src_ptr += 2 * src_stride;
276 dst_ptr += 2 * dst_stride;
277 h -= 2;
278 } while (h != 0);
279 }
280 }
281
282 static inline uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
283 const int16x8_t s2, const int16x8_t s3,
284 const int16x8_t s4, const int16x8_t s5,
285 const int16x8_t s6, const int16x8_t s7,
286 const int16x8_t filter,
287 const int16x8_t horiz_const) {
288 const int16x4_t filter_lo = vget_low_s16(filter);
289 const int16x4_t filter_hi = vget_high_s16(filter);
290
291 int16x8_t sum = horiz_const;
292 sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
293 sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
294 sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
295 sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
296 sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
297 sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
298 sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
299 sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
300
301 // We halved the convolution filter values so -1 from the right shift.
302 return vqrshrun_n_s16(sum, FILTER_BITS - 1);
303 }
304
305 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
306 int dst_stride, int w, int h,
307 const InterpFilterParams *filter_params_x,
308 const int subpel_x_qn,
309 ConvolveParams *conv_params) {
310 if (w == 2 || h == 2) {
311 av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
312 subpel_x_qn, conv_params);
313 return;
314 }
315
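// Back the source pointer up so that the filter window is roughly centred on
// the output position: for an N-tap filter the first tap is applied
// N / 2 - 1 pixels to the left of the pixel being interpolated.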
316 const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
317 src -= horiz_offset;
318
319 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
320 filter_params_x, subpel_x_qn & SUBPEL_MASK);
321
322 int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);
323
324 if (filter_taps > 8) {
325 convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
326 x_filter_ptr);
327 return;
328 }
329
330 if (filter_taps <= 4) {
331 convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h,
332 x_filter_ptr);
333 return;
334 }
335
336 // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
337 // rounding right shift by FILTER_BITS - instead of a first rounding right
338 // shift by ROUND0_BITS, followed by a second rounding right shift by
339 // FILTER_BITS - ROUND0_BITS.
340 // The outermost -1 is needed because we will halve the filter values.
341 const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
342
343 // Filter values are even so halve to reduce precision requirements.
344 const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
345
346 #if AOM_ARCH_AARCH64
347 while (h >= 8) {
348 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
349 load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
350
351 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
352 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
353 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
354 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
355 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
356 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
357 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
358 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
359
360 int width = w;
361 const uint8_t *s = src + 7;
362 uint8_t *d = dst;
363
364 __builtin_prefetch(d + 0 * dst_stride);
365 __builtin_prefetch(d + 1 * dst_stride);
366 __builtin_prefetch(d + 2 * dst_stride);
367 __builtin_prefetch(d + 3 * dst_stride);
368 __builtin_prefetch(d + 4 * dst_stride);
369 __builtin_prefetch(d + 5 * dst_stride);
370 __builtin_prefetch(d + 6 * dst_stride);
371 __builtin_prefetch(d + 7 * dst_stride);
372
373 do {
374 uint8x8_t t8, t9, t10, t11, t12, t13, t14;
375 load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
376
377 transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
378 &t14);
379 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
380 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
381 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
382 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
383 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
384 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
385 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
386 int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
387
388 uint8x8_t d0 =
389 convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
390 uint8x8_t d1 =
391 convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const);
392 uint8x8_t d2 =
393 convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const);
394 uint8x8_t d3 =
395 convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const);
396 uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
397 horiz_const);
398 uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
399 horiz_const);
400 uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
401 horiz_const);
402 uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
403 x_filter, horiz_const);
404
405 transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
406
407 store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
408
409 s0 = s8;
410 s1 = s9;
411 s2 = s10;
412 s3 = s11;
413 s4 = s12;
414 s5 = s13;
415 s6 = s14;
416 s += 8;
417 d += 8;
418 width -= 8;
419 } while (width != 0);
420 src += 8 * src_stride;
421 dst += 8 * dst_stride;
422 h -= 8;
423 }
424 #endif // AOM_ARCH_AARCH64
425
426 while (h-- != 0) {
427 uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
428 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
429
430 int width = w;
431 const uint8_t *s = src + 8;
432 uint8_t *d = dst;
433
434 __builtin_prefetch(d);
435
436 do {
437 uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
438 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
439
440 int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
441 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
442 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
443 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
444 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
445 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
446 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
447
448 uint8x8_t d0 =
449 convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
450
451 vst1_u8(d, d0);
452
453 s0 = s8;
454 s += 8;
455 d += 8;
456 width -= 8;
457 } while (width != 0);
458 src += src_stride;
459 dst += dst_stride;
460 }
461 }
462
463 static inline uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1,
464 const int16x8_t s2, const int16x8_t s3,
465 const int16x4_t filter) {
466 int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
467 sum = vmlaq_lane_s16(sum, s1, filter, 1);
468 sum = vmlaq_lane_s16(sum, s2, filter, 2);
469 sum = vmlaq_lane_s16(sum, s3, filter, 3);
470
471 // We halved the filter values so -1 from right shift.
472 return vqrshrun_n_s16(sum, FILTER_BITS - 1);
473 }
474
475 static inline void convolve_y_sr_4tap_neon(const uint8_t *src,
476 const int src_stride, uint8_t *dst,
477 const int dst_stride, int w, int h,
478 const int16_t *filter_y) {
479 // All filter values are even, halve to reduce intermediate precision
480 // requirements.
481 const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);
482
483 if (w == 4) {
484 uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
485 uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
486
487 int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
488 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
489
490 src += 2 * src_stride;
491
492 do {
493 uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
494 uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
495 uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
496 uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
497
498 int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
499 int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
500 int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
501 int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));
502
503 uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter);
504 uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter);
505
506 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
507 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
508
509 s01 = s45;
510 s12 = s56;
511
512 src += 4 * src_stride;
513 dst += 4 * dst_stride;
514 h -= 4;
515 } while (h != 0);
516 } else {
517 do {
518 uint8x8_t t0, t1, t2;
519 load_u8_8x3(src, src_stride, &t0, &t1, &t2);
520
521 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
522 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
523 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
524
525 int height = h;
526 const uint8_t *s = src + 3 * src_stride;
527 uint8_t *d = dst;
528
529 do {
530 uint8x8_t t3;
531 load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
532
533 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
534 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
535 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
536 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
537
538 uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter);
539 uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter);
540 uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, filter);
541 uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter);
542
543 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
544
545 s0 = s4;
546 s1 = s5;
547 s2 = s6;
548
549 s += 4 * src_stride;
550 d += 4 * dst_stride;
551 height -= 4;
552 } while (height != 0);
553 src += 8;
554 dst += 8;
555 w -= 8;
556 } while (w != 0);
557 }
558 }
559
560 static inline int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
561 const int16x4_t s2, const int16x4_t s3,
562 const int16x4_t s4, const int16x4_t s5,
563 const int16x8_t y_filter_0_7) {
564 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
565 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
566
567 // Filter values at indices 0 and 7 are 0.
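// The nominally 8-tap kernel therefore degenerates to 6 taps here: only lanes
// 1..6 of y_filter_0_7 are multiplied in, saving two multiply-accumulates per
// output compared with the full 8-tap kernel.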
568 int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
569 sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
570 sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
571 sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
572 sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
573 sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
574
575 return sum;
576 }
577
578 static inline uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
579 const int16x8_t s2, const int16x8_t s3,
580 const int16x8_t s4, const int16x8_t s5,
581 const int16x8_t y_filters) {
582 const int16x4_t y_filter_lo = vget_low_s16(y_filters);
583 const int16x4_t y_filter_hi = vget_high_s16(y_filters);
584
585 // Filter values at indices 0 and 7 are 0.
586 int16x8_t sum = vmulq_lane_s16(s0, y_filter_lo, 1);
587 sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2);
588 sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3);
589 sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0);
590 sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1);
591 sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2);
592 // We halved the convolution filter values so -1 from the right shift.
593 return vqrshrun_n_s16(sum, FILTER_BITS - 1);
594 }
595
596 static inline void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
597 int src_stride, uint8_t *dst_ptr,
598 const int dst_stride, int w, int h,
599 const int16x8_t y_filter) {
600 if (w <= 4) {
601 uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
602 uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
603 uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
604 uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
605 uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
606
607 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
608 int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
609 int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
610 int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
611 int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
612
613 src_ptr += 5 * src_stride;
614
615 do {
616 #if AOM_ARCH_AARCH64
617 uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
618 uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
619 uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
620 uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
621
622 int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
623 int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
624 int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
625 int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
626
627 int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
628 int16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter);
629 int16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter);
630 int16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter);
631
632 // We halved the convolution filter values so -1 from the right shift.
633 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
634 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
635
636 store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
637 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
638
639 s0 = s4;
640 s1 = s5;
641 s2 = s6;
642 s3 = s7;
643 s4 = s8;
644 src_ptr += 4 * src_stride;
645 dst_ptr += 4 * dst_stride;
646 h -= 4;
647 #else // !AOM_ARCH_AARCH64
648 uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr);
649 int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
650
651 int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
652 // We halved the convolution filter values so -1 from the right shift.
653 uint8x8_t d01 =
654 vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
655
656 store_u8_4x1(dst_ptr, d01);
657
658 s0 = s1;
659 s1 = s2;
660 s2 = s3;
661 s3 = s4;
662 s4 = s5;
663 src_ptr += src_stride;
664 dst_ptr += dst_stride;
665 h--;
666 #endif // AOM_ARCH_AARCH64
667 } while (h != 0);
668
669 } else {
670 do {
671 const uint8_t *s = src_ptr;
672 uint8_t *d = dst_ptr;
673 int height = h;
674
675 uint8x8_t t0, t1, t2, t3, t4;
676 load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4);
677
678 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
679 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
680 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
681 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
682 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
683
684 s += 5 * src_stride;
685
686 do {
687 #if AOM_ARCH_AARCH64
688 uint8x8_t t5, t6, t7, t8;
689 load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);
690
691 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
692 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
693 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
694 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
695
696 uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
697 uint8x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter);
698 uint8x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter);
699 uint8x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter);
700
701 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
702
703 s0 = s4;
704 s1 = s5;
705 s2 = s6;
706 s3 = s7;
707 s4 = s8;
708 s += 4 * src_stride;
709 d += 4 * dst_stride;
710 height -= 4;
711 #else // !AOM_ARCH_AARCH64
712 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
713
714 uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
715
716 vst1_u8(d, d0);
717
718 s0 = s1;
719 s1 = s2;
720 s2 = s3;
721 s3 = s4;
722 s4 = s5;
723 s += src_stride;
724 d += dst_stride;
725 height--;
726 #endif // AOM_ARCH_AARCH64
727 } while (height != 0);
728 src_ptr += 8;
729 dst_ptr += 8;
730 w -= 8;
731 } while (w != 0);
732 }
733 }
734
735 static inline int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
736 const int16x4_t s2, const int16x4_t s3,
737 const int16x4_t s4, const int16x4_t s5,
738 const int16x4_t s6, const int16x4_t s7,
739 const int16x8_t filter) {
740 const int16x4_t filter_lo = vget_low_s16(filter);
741 const int16x4_t filter_hi = vget_high_s16(filter);
742
743 int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0);
744 sum = vmla_lane_s16(sum, s1, filter_lo, 1);
745 sum = vmla_lane_s16(sum, s2, filter_lo, 2);
746 sum = vmla_lane_s16(sum, s3, filter_lo, 3);
747 sum = vmla_lane_s16(sum, s4, filter_hi, 0);
748 sum = vmla_lane_s16(sum, s5, filter_hi, 1);
749 sum = vmla_lane_s16(sum, s6, filter_hi, 2);
750 sum = vmla_lane_s16(sum, s7, filter_hi, 3);
751
752 return sum;
753 }
754
755 static inline uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
756 const int16x8_t s2, const int16x8_t s3,
757 const int16x8_t s4, const int16x8_t s5,
758 const int16x8_t s6, const int16x8_t s7,
759 const int16x8_t filter) {
760 const int16x4_t filter_lo = vget_low_s16(filter);
761 const int16x4_t filter_hi = vget_high_s16(filter);
762
763 int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0);
764 sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
765 sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
766 sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
767 sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
768 sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
769 sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
770 sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
771
772 // We halved the convolution filter values so -1 from the right shift.
773 return vqrshrun_n_s16(sum, FILTER_BITS - 1);
774 }
775
776 static inline void convolve_y_sr_8tap_neon(const uint8_t *src_ptr,
777 int src_stride, uint8_t *dst_ptr,
778 const int dst_stride, int w, int h,
779 const int16x8_t y_filter) {
780 if (w <= 4) {
781 uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
782 uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
783 uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
784 uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
785 uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
786 uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride);
787 uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride);
788
789 int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
790 int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
791 int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
792 int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
793 int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
794 int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
795 int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
796
797 src_ptr += 7 * src_stride;
798
799 do {
800 #if AOM_ARCH_AARCH64
801 uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
802 uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
803 uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
804 uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
805
806 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
807 int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
808 int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
809 int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
810
811 int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
812 int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
813 int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
814 int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
815
816 // We halved the convolution filter values so -1 from the right shift.
817 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
818 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
819
820 store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
821 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
822
823 s0 = s4;
824 s1 = s5;
825 s2 = s6;
826 s3 = s7;
827 s4 = s8;
828 s5 = s9;
829 s6 = s10;
830 src_ptr += 4 * src_stride;
831 dst_ptr += 4 * dst_stride;
832 h -= 4;
833 #else // !AOM_ARCH_AARCH64
834 uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr);
835 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
836
837 int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
838 // We halved the convolution filter values so -1 from the right shift.
839 uint8x8_t d01 =
840 vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
841
842 store_u8_4x1(dst_ptr, d01);
843
844 s0 = s1;
845 s1 = s2;
846 s2 = s3;
847 s3 = s4;
848 s4 = s5;
849 s5 = s6;
850 s6 = s7;
851 src_ptr += src_stride;
852 dst_ptr += dst_stride;
853 h--;
854 #endif // AOM_ARCH_AARCH64
855 } while (h != 0);
856 } else {
857 do {
858 const uint8_t *s = src_ptr;
859 uint8_t *d = dst_ptr;
860 int height = h;
861
862 uint8x8_t t0, t1, t2, t3, t4, t5, t6;
863 load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
864
865 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
866 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
867 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
868 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
869 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
870 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
871 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
872
873 s += 7 * src_stride;
874
875 do {
876 #if AOM_ARCH_AARCH64
877 uint8x8_t t7, t8, t9, t10;
878 load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
879
880 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
881 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
882 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
883 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
884
885 uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
886 uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
887 uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
888 uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
889
890 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
891
892 s0 = s4;
893 s1 = s5;
894 s2 = s6;
895 s3 = s7;
896 s4 = s8;
897 s5 = s9;
898 s6 = s10;
899 s += 4 * src_stride;
900 d += 4 * dst_stride;
901 height -= 4;
902 #else // !AOM_ARCH_AARCH64
903 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
904
905 uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
906
907 vst1_u8(d, d0);
908
909 s0 = s1;
910 s1 = s2;
911 s2 = s3;
912 s3 = s4;
913 s4 = s5;
914 s5 = s6;
915 s6 = s7;
916 s += src_stride;
917 d += dst_stride;
918 height--;
919 #endif // AOM_ARCH_AARCH64
920 } while (height != 0);
921 src_ptr += 8;
922 dst_ptr += 8;
923 w -= 8;
924 } while (w != 0);
925 }
926 }
927
928 static inline int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1,
929 const int16x4_t s2, const int16x4_t s3,
930 const int16x4_t s4, const int16x4_t s5,
931 const int16x4_t s6, const int16x4_t s7,
932 const int16x4_t s8, const int16x4_t s9,
933 const int16x4_t s10, const int16x4_t s11,
934 const int16x8_t y_filter_0_7,
935 const int16x4_t y_filter_8_11) {
936 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
937 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
938 int16x4_t sum;
939
940 sum = vmul_lane_s16(s0, y_filter_0_3, 0);
941 sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
942 sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
943 sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
944 sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
945
946 sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
947 sum = vmla_lane_s16(sum, s8, y_filter_8_11, 0);
948 sum = vmla_lane_s16(sum, s9, y_filter_8_11, 1);
949 sum = vmla_lane_s16(sum, s10, y_filter_8_11, 2);
950 sum = vmla_lane_s16(sum, s11, y_filter_8_11, 3);
951
952 // Saturating addition is required for the largest filter taps to avoid
953 // overflow (while staying in 16-bit elements.)
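// Rough reasoning (a sketch rather than a formal bound): with 8-bit samples
// and signed 16-bit accumulation, the products of the ten smaller taps can be
// summed without wrapping, but partial sums that already include the two
// large centre taps may transiently exceed the int16 range, so those two
// products are added last with saturating adds so that any excess saturates
// instead of wrapping around.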
954 sum = vqadd_s16(sum, vmul_lane_s16(s5, y_filter_4_7, 1));
955 sum = vqadd_s16(sum, vmul_lane_s16(s6, y_filter_4_7, 2));
956
957 return sum;
958 }
959
960 static inline uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1,
961 const int16x8_t s2, const int16x8_t s3,
962 const int16x8_t s4, const int16x8_t s5,
963 const int16x8_t s6, const int16x8_t s7,
964 const int16x8_t s8, const int16x8_t s9,
965 const int16x8_t s10, const int16x8_t s11,
966 const int16x8_t y_filter_0_7,
967 const int16x4_t y_filter_8_11) {
968 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
969 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
970 int16x8_t sum;
971
972 sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
973 sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
974 sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
975 sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
976 sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
977
978 sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
979 sum = vmlaq_lane_s16(sum, s8, y_filter_8_11, 0);
980 sum = vmlaq_lane_s16(sum, s9, y_filter_8_11, 1);
981 sum = vmlaq_lane_s16(sum, s10, y_filter_8_11, 2);
982 sum = vmlaq_lane_s16(sum, s11, y_filter_8_11, 3);
983
984 // Saturating addition is required for the largest filter taps to avoid
985 // overflow (while staying in 16-bit elements.)
986 sum = vqaddq_s16(sum, vmulq_lane_s16(s5, y_filter_4_7, 1));
987 sum = vqaddq_s16(sum, vmulq_lane_s16(s6, y_filter_4_7, 2));
988
989 return vqrshrun_n_s16(sum, FILTER_BITS);
990 }
991
992 static inline void convolve_y_sr_12tap_neon(const uint8_t *src_ptr,
993 int src_stride, uint8_t *dst_ptr,
994 int dst_stride, int w, int h,
995 const int16_t *y_filter_ptr) {
996 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
997 const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
998
999 if (w <= 4) {
1000 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
1001 load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
1002 &t8, &t9, &t10);
1003 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1004 int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1005 int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1006 int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1007 int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
1008 int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
1009 int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
1010 int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
1011 int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
1012 int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
1013 int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
1014
1015 src_ptr += 11 * src_stride;
1016
1017 do {
1018 uint8x8_t t11, t12, t13, t14;
1019 load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14);
1020
1021 int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11)));
1022 int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12)));
1023 int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13)));
1024 int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14)));
1025
1026 int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
1027 s11, y_filter_0_7, y_filter_8_11);
1028 int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
1029 s11, s12, y_filter_0_7, y_filter_8_11);
1030 int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1031 s12, s13, y_filter_0_7, y_filter_8_11);
1032 int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
1033 s13, s14, y_filter_0_7, y_filter_8_11);
1034
1035 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
1036 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
1037
1038 store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
1039 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
1040
1041 s0 = s4;
1042 s1 = s5;
1043 s2 = s6;
1044 s3 = s7;
1045 s4 = s8;
1046 s5 = s9;
1047 s6 = s10;
1048 s7 = s11;
1049 s8 = s12;
1050 s9 = s13;
1051 s10 = s14;
1052 src_ptr += 4 * src_stride;
1053 dst_ptr += 4 * dst_stride;
1054 h -= 4;
1055 } while (h != 0);
1056
1057 } else {
1058 do {
1059 const uint8_t *s = src_ptr;
1060 uint8_t *d = dst_ptr;
1061 int height = h;
1062
1063 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
1064 load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
1065 &t9, &t10);
1066 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1067 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1068 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1069 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1070 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1071 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1072 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1073 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
1074 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
1075 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
1076 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
1077
1078 s += 11 * src_stride;
1079
1080 do {
1081 uint8x8_t t11, t12, t13, t14;
1082 load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14);
1083
1084 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
1085 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
1086 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
1087 int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
1088
1089 uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
1090 s10, s11, y_filter_0_7, y_filter_8_11);
1091 uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
1092 s11, s12, y_filter_0_7, y_filter_8_11);
1093 uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1094 s12, s13, y_filter_0_7, y_filter_8_11);
1095 uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
1096 s13, s14, y_filter_0_7, y_filter_8_11);
1097
1098 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
1099
1100 s0 = s4;
1101 s1 = s5;
1102 s2 = s6;
1103 s3 = s7;
1104 s4 = s8;
1105 s5 = s9;
1106 s6 = s10;
1107 s7 = s11;
1108 s8 = s12;
1109 s9 = s13;
1110 s10 = s14;
1111 s += 4 * src_stride;
1112 d += 4 * dst_stride;
1113 height -= 4;
1114 } while (height != 0);
1115 src_ptr += 8;
1116 dst_ptr += 8;
1117 w -= 8;
1118 } while (w != 0);
1119 }
1120 }
1121
1122 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
1123 int dst_stride, int w, int h,
1124 const InterpFilterParams *filter_params_y,
1125 const int subpel_y_qn) {
1126 if (w == 2 || h == 2) {
1127 av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
1128 subpel_y_qn);
1129 return;
1130 }
1131
1132 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
1133 const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
1134 const int vert_offset = clamped_y_taps / 2 - 1;
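// Filters reporting fewer than 4 taps (e.g. the 2-tap kernels at some subpel
// positions) are assumed to share the zero-padded layout of the 4-tap kernels
// and are routed through the 4-tap path, so the tap count is clamped here to
// keep the vertical source offset consistent with that path.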
1135
1136 src -= vert_offset * src_stride;
1137
1138 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
1139 filter_params_y, subpel_y_qn & SUBPEL_MASK);
1140
1141 if (y_filter_taps > 8) {
1142 convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
1143 y_filter_ptr);
1144 return;
1145 }
1146
1147 // Filter values are even so halve to reduce precision requirements.
1148 const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
1149
1150 if (y_filter_taps <= 4) {
1151 convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h,
1152 y_filter_ptr);
1153 } else if (y_filter_taps == 6) {
1154 convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
1155 } else {
1156 convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
1157 }
1158 }
1159
1160 static inline int16x4_t convolve12_4_2d_h(
1161 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
1162 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
1163 const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
1164 const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
1165 const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
1166 const int32x4_t horiz_const) {
1167 const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
1168 const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
1169
1170 int32x4_t sum = horiz_const;
1171 sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
1172 sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
1173 sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
1174 sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
1175 sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
1176 sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
1177 sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
1178 sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
1179 sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
1180 sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
1181 sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
1182 sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);
1183
1184 return vshrn_n_s32(sum, ROUND0_BITS);
1185 }
1186
1187 static inline void convolve_2d_sr_horiz_12tap_neon(
1188 const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
1189 const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
1190 const int16x4_t x_filter_8_11) {
1191 const int bd = 8;
1192 // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
1193 // which are generally faster than rounding shifts on modern CPUs.
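// The (1 << (bd + FILTER_BITS - 1)) term mirrors the offset the scalar
// reference adds in its horizontal pass so that the intermediate im_block
// values stay non-negative; it is compensated for again in the vertical pass.
// The (1 << (ROUND0_BITS - 1)) term supplies the rounding for the plain
// (non-rounding) vshrn_n_s32(sum, ROUND0_BITS) at the end of
// convolve12_4_2d_h.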
1194 const int32x4_t horiz_const =
1195 vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
1196
1197 #if AOM_ARCH_AARCH64
1198 do {
1199 const uint8_t *s = src_ptr;
1200 int16_t *d = dst_ptr;
1201 int width = w;
1202
1203 uint8x8_t t0, t1, t2, t3;
1204 load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1205 transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
1206
1207 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1208 int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1209 int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1210 int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1211 int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1212 int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1213 int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1214 int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1215
1216 load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
1217 transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
1218
1219 int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1220 int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1221 int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1222
1223 s += 11;
1224
1225 do {
1226 load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1227 transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
1228
1229 int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1230 int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1231 int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1232 int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1233
1234 int16x4_t d0 =
1235 convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1236 x_filter_0_7, x_filter_8_11, horiz_const);
1237 int16x4_t d1 =
1238 convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
1239 x_filter_0_7, x_filter_8_11, horiz_const);
1240 int16x4_t d2 =
1241 convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
1242 x_filter_0_7, x_filter_8_11, horiz_const);
1243 int16x4_t d3 =
1244 convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
1245 x_filter_0_7, x_filter_8_11, horiz_const);
1246
1247 transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
1248 store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
1249
1250 s0 = s4;
1251 s1 = s5;
1252 s2 = s6;
1253 s3 = s7;
1254 s4 = s8;
1255 s5 = s9;
1256 s6 = s10;
1257 s7 = s11;
1258 s8 = s12;
1259 s9 = s13;
1260 s10 = s14;
1261 s += 4;
1262 d += 4;
1263 width -= 4;
1264 } while (width != 0);
1265 src_ptr += 4 * src_stride;
1266 dst_ptr += 4 * dst_stride;
1267 h -= 4;
1268 } while (h > 4);
1269 #endif // AOM_ARCH_AARCH64
1270
1271 do {
1272 const uint8_t *s = src_ptr;
1273 int16_t *d = dst_ptr;
1274 int width = w;
1275
1276 do {
1277 uint8x16_t t0 = vld1q_u8(s);
1278 int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
1279 int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
1280
1281 int16x4_t s0 = vget_low_s16(tt0);
1282 int16x4_t s4 = vget_high_s16(tt0);
1283 int16x4_t s8 = vget_low_s16(tt1);
1284 int16x4_t s12 = vget_high_s16(tt1);
1285
1286 int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
1287 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
1288 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
1289 int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
1290 int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
1291 int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
1292 int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12
1293 int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13
1294 int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14
1295
1296 int16x4_t d0 =
1297 convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1298 x_filter_0_7, x_filter_8_11, horiz_const);
1299 vst1_s16(d, d0);
1300
1301 s += 4;
1302 d += 4;
1303 width -= 4;
1304 } while (width != 0);
1305 src_ptr += src_stride;
1306 dst_ptr += dst_stride;
1307 } while (--h != 0);
1308 }
1309
1310 static inline int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1,
1311 const int16x8_t s2, const int16x8_t s3,
1312 const int16x4_t filter,
1313 const int16x8_t horiz_const) {
1314 int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0);
1315 sum = vmlaq_lane_s16(sum, s1, filter, 1);
1316 sum = vmlaq_lane_s16(sum, s2, filter, 2);
1317 sum = vmlaq_lane_s16(sum, s3, filter, 3);
1318 // We halved the filter values so -1 from right shift.
1319 return vshrq_n_s16(sum, ROUND0_BITS - 1);
1320 }
1321
1322 static inline void convolve_2d_sr_horiz_4tap_neon(
1323 const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
1324 ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
1325 const int bd = 8;
1326 // All filter values are even, halve to reduce intermediate precision
1327 // requirements.
1328 const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1);
1329
1330 // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
1331 // shifts - which are generally faster than rounding shifts on modern CPUs.
1332 // (The extra -1 is needed because we halved the filter values.)
1333 const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
1334 (1 << ((ROUND0_BITS - 1) - 1)));
1335
1336 if (w == 4) {
1337 do {
1338 uint8x8_t t01[4];
1339 t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
1340 t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
1341 t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
1342 t01[3] = load_unaligned_u8(src + 3, (int)src_stride);
1343
1344 int16x8_t s01[4];
1345 s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
1346 s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
1347 s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
1348 s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
1349
1350 int16x8_t d01 =
1351 convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
1352
1353 store_s16x4_strided_x2(dst, (int)dst_stride, d01);
1354
1355 src += 2 * src_stride;
1356 dst += 2 * dst_stride;
1357 h -= 2;
1358 } while (h > 0);
1359 } else {
1360 do {
1361 int width = w;
1362 const uint8_t *s = src;
1363 int16_t *d = dst;
1364
1365 do {
1366 uint8x8_t t0[4], t1[4];
1367 load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
1368 load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
1369
1370 int16x8_t s0[4];
1371 s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
1372 s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
1373 s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
1374 s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
1375
1376 int16x8_t s1[4];
1377 s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
1378 s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
1379 s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
1380 s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
1381
1382 int16x8_t d0 =
1383 convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
1384 int16x8_t d1 =
1385 convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
1386
1387 store_s16_8x2(d, dst_stride, d0, d1);
1388
1389 s += 8;
1390 d += 8;
1391 width -= 8;
1392 } while (width != 0);
1393 src += 2 * src_stride;
1394 dst += 2 * dst_stride;
1395 h -= 2;
1396 } while (h > 2);
1397
1398 do {
1399 const uint8_t *s = src;
1400 int16_t *d = dst;
1401 int width = w;
1402
1403 do {
1404 uint8x8_t t0[4];
1405 load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
1406
1407 int16x8_t s0[4];
1408 s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
1409 s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
1410 s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
1411 s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
1412
1413 int16x8_t d0 =
1414 convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
1415
1416 vst1q_s16(d, d0);
1417
1418 s += 8;
1419 d += 8;
1420 width -= 8;
1421 } while (width != 0);
1422 src += src_stride;
1423 dst += dst_stride;
1424 } while (--h != 0);
1425 }
1426 }
1427
1428 static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
1429 const int16x8_t s2, const int16x8_t s3,
1430 const int16x8_t s4, const int16x8_t s5,
1431 const int16x8_t s6, const int16x8_t s7,
1432 const int16x8_t filter,
1433 const int16x8_t horiz_const) {
1434 const int16x4_t filter_lo = vget_low_s16(filter);
1435 const int16x4_t filter_hi = vget_high_s16(filter);
1436
1437 int16x8_t sum = horiz_const;
1438 sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
1439 sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
1440 sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
1441 sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
1442 sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
1443 sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
1444 sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
1445 sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
1446
1447 // The convolution filter values were halved, so the right shift is reduced by 1.
1448 return vshrq_n_s16(sum, ROUND0_BITS - 1);
1449 }
1450
1451 static inline void convolve_2d_sr_horiz_8tap_neon(
1452 const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
1453 int im_h, const int16_t *x_filter_ptr) {
1454 const int bd = 8;
1455
1456 const uint8_t *src_ptr = src;
1457 int16_t *dst_ptr = im_block;
1458 int dst_stride = im_stride;
1459 int height = im_h;
1460
1461 // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
1462 // shifts - which are generally faster than rounding shifts on modern CPUs.
1463 // (The extra -1 is needed because we halved the filter values.)
1464 const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
1465 (1 << ((ROUND0_BITS - 1) - 1)));
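  // Note: adding 1 << (n - 1) before a plain right shift by n is equivalent to
  // a rounding right shift by n, i.e. ROUND_POWER_OF_TWO(x, n). Here n is
  // ROUND0_BITS - 1 rather than ROUND0_BITS because the filter values are
  // halved below. The 1 << (bd + FILTER_BITS - 2) term is the usual
  // 2D-convolution intermediate offset of 1 << (bd + FILTER_BITS - 1),
  // likewise halved.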
1466 // Filter values are even, so halve them to reduce the intermediate precision requirements.
1467 const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
1468
1469 #if AOM_ARCH_AARCH64
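  // Eight-row blocks: load an 8x8 tile, transpose it so that each vector holds
  // one input column, filter eight rows at once with vmlaq_lane_s16, then
  // transpose the results back before storing.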
1470 while (height > 8) {
1471 const uint8_t *s = src_ptr;
1472 int16_t *d = dst_ptr;
1473 int width = w;
1474
1475 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
1476 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1477 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1478
1479 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1480 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1481 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1482 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1483 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1484 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1485 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1486
1487 s += 7;
1488
1489 do {
1490 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1491
1492 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1493
1494 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
1495 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1496 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
1497 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
1498 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
1499 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
1500 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
1501 int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
1502
1503 int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
1504 horiz_const);
1505 int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
1506 horiz_const);
1507 int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
1508 horiz_const);
1509 int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
1510 horiz_const);
1511 int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
1512 x_filter, horiz_const);
1513 int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
1514 x_filter, horiz_const);
1515 int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
1516 x_filter, horiz_const);
1517 int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
1518 x_filter, horiz_const);
1519
1520 transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
1521
1522 store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
1523
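      // Carry the last seven transposed columns over to the next tile so that
      // only eight new input columns are loaded per iteration.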
1524 s0 = s8;
1525 s1 = s9;
1526 s2 = s10;
1527 s3 = s11;
1528 s4 = s12;
1529 s5 = s13;
1530 s6 = s14;
1531 s += 8;
1532 d += 8;
1533 width -= 8;
1534 } while (width != 0);
1535 src_ptr += 8 * src_stride;
1536 dst_ptr += 8 * dst_stride;
1537 height -= 8;
1538 }
1539 #endif // AOM_ARCH_AARCH64
1540
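  // Process rows one at a time: all rows on Armv7, or the up-to-eight rows
  // left over from the block loop on AArch64. vextq_s16 builds the eight
  // shifted input windows from two adjacent 8-pixel loads.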
1541 do {
1542 const uint8_t *s = src_ptr;
1543 int16_t *d = dst_ptr;
1544 int width = w;
1545
1546 uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
1547 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1548
1549 do {
1550 uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15
1551 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1552
1553 int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
1554 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
1555 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
1556 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
1557 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
1558 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
1559 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
1560
1561 int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
1562 horiz_const);
1563
1564 vst1q_s16(d, d0);
1565
1566 s0 = s8;
1567 s += 8;
1568 d += 8;
1569 width -= 8;
1570 } while (width != 0);
1571 src_ptr += src_stride;
1572 dst_ptr += dst_stride;
1573 } while (--height != 0);
1574 }
1575
1576 void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
1577 int dst_stride, int w, int h,
1578 const InterpFilterParams *filter_params_x,
1579 const InterpFilterParams *filter_params_y,
1580 const int subpel_x_qn, const int subpel_y_qn,
1581 ConvolveParams *conv_params) {
1582 if (w == 2 || h == 2) {
1583 av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
1584 filter_params_x, filter_params_y, subpel_x_qn,
1585 subpel_y_qn, conv_params);
1586 return;
1587 }
1588
1589 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
1590 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
1591 const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
1592 const int im_h = h + clamped_y_taps - 1;
1593 const int im_stride = MAX_SB_SIZE;
1594 const int vert_offset = clamped_y_taps / 2 - 1;
1595 const int horiz_offset = filter_params_x->taps / 2 - 1;
1596 const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
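  // Step back by vert_offset rows and horiz_offset columns so that the first
  // filter taps read the samples above and to the left of the output position.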
1597
1598 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
1599 filter_params_x, subpel_x_qn & SUBPEL_MASK);
1600 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
1601 filter_params_y, subpel_y_qn & SUBPEL_MASK);
1602
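  // Filters with more than eight taps (the 12-tap filter) take a dedicated
  // pair of horizontal and vertical passes; everything else goes through the
  // 4/6/8-tap paths below.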
1603 if (filter_params_x->taps > 8) {
1604 DECLARE_ALIGNED(16, int16_t,
1605 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
1606
1607 const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
1608 const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
1609 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
1610 const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
1611
1612 convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w,
1613 im_h, x_filter_0_7, x_filter_8_11);
1614
1615 convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1616 y_filter_0_7, y_filter_8_11);
1617 } else {
1618 DECLARE_ALIGNED(16, int16_t,
1619 im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
1620
1621 if (x_filter_taps <= 4) {
1622 convolve_2d_sr_horiz_4tap_neon(src_ptr + 2, src_stride, im_block,
1623 im_stride, w, im_h, x_filter_ptr);
1624 } else {
1625 convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride,
1626 w, im_h, x_filter_ptr);
1627 }
1628
1629 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
1630
1631 if (clamped_y_taps <= 4) {
1632 convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1633 y_filter_ptr);
1634 } else if (clamped_y_taps == 6) {
1635 convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1636 y_filter);
1637 } else {
1638 convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1639 y_filter);
1640 }
1641 }
1642 }
1643
1644 void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
1645 uint8_t *dst, int dst_stride, int w, int h,
1646 const InterpFilterParams *filter_params_x,
1647 const int subpel_x_qn,
1648 ConvolveParams *conv_params) {
1649 assert(subpel_x_qn == 8);
1650 assert(filter_params_x->taps == 2);
1651 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
1652 (void)filter_params_x;
1653 (void)subpel_x_qn;
1654 (void)conv_params;
1655
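  // subpel_x_qn == 8 is the half-pel position of the 2-tap bilinear filter, so
  // the filtered output reduces to a rounding average of horizontally adjacent
  // pixels: dst[x] = (src[x] + src[x + 1] + 1) >> 1, which is exactly what
  // vrhadd_u8 computes.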
1656 if (w <= 4) {
1657 do {
1658 uint8x8_t s0_0 = vld1_u8(src);
1659 uint8x8_t s0_1 = vld1_u8(src + 1);
1660 uint8x8_t s1_0 = vld1_u8(src + src_stride);
1661 uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
1662
1663 uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
1664 uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
1665
1666 if (w == 2) {
1667 store_u8_2x1(dst + 0 * dst_stride, d0);
1668 store_u8_2x1(dst + 1 * dst_stride, d1);
1669 } else {
1670 store_u8_4x1(dst + 0 * dst_stride, d0);
1671 store_u8_4x1(dst + 1 * dst_stride, d1);
1672 }
1673
1674 src += 2 * src_stride;
1675 dst += 2 * dst_stride;
1676 h -= 2;
1677 } while (h != 0);
1678 } else if (w == 8) {
1679 do {
1680 uint8x8_t s0_0 = vld1_u8(src);
1681 uint8x8_t s0_1 = vld1_u8(src + 1);
1682 uint8x8_t s1_0 = vld1_u8(src + src_stride);
1683 uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
1684
1685 uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
1686 uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
1687
1688 vst1_u8(dst, d0);
1689 vst1_u8(dst + dst_stride, d1);
1690
1691 src += 2 * src_stride;
1692 dst += 2 * dst_stride;
1693 h -= 2;
1694 } while (h != 0);
1695 } else {
1696 do {
1697 const uint8_t *src_ptr = src;
1698 uint8_t *dst_ptr = dst;
1699 int width = w;
1700
1701 do {
1702 uint8x16_t s0 = vld1q_u8(src_ptr);
1703 uint8x16_t s1 = vld1q_u8(src_ptr + 1);
1704
1705 uint8x16_t d0 = vrhaddq_u8(s0, s1);
1706
1707 vst1q_u8(dst_ptr, d0);
1708
1709 src_ptr += 16;
1710 dst_ptr += 16;
1711 width -= 16;
1712 } while (width != 0);
1713 src += src_stride;
1714 dst += dst_stride;
1715 } while (--h != 0);
1716 }
1717 }
1718
1719 void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
1720 uint8_t *dst, int dst_stride, int w, int h,
1721 const InterpFilterParams *filter_params_y,
1722 const int subpel_y_qn) {
1723 assert(subpel_y_qn == 8);
1724 assert(filter_params_y->taps == 2);
1725 (void)filter_params_y;
1726 (void)subpel_y_qn;
1727
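  // As in the horizontal intraBC case, the half-pel 2-tap filter reduces to a
  // rounding average, here of vertically adjacent rows:
  // dst[x] = (src[x] + src[x + src_stride] + 1) >> 1.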
1728 if (w <= 4) {
1729 do {
1730 uint8x8_t s0 = load_unaligned_u8_4x1(src);
1731 uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride);
1732 uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride);
1733
1734 uint8x8_t d0 = vrhadd_u8(s0, s1);
1735 uint8x8_t d1 = vrhadd_u8(s1, s2);
1736
1737 if (w == 2) {
1738 store_u8_2x1(dst + 0 * dst_stride, d0);
1739 store_u8_2x1(dst + 1 * dst_stride, d1);
1740 } else {
1741 store_u8_4x1(dst + 0 * dst_stride, d0);
1742 store_u8_4x1(dst + 1 * dst_stride, d1);
1743 }
1744
1745 src += 2 * src_stride;
1746 dst += 2 * dst_stride;
1747 h -= 2;
1748 } while (h != 0);
1749 } else if (w == 8) {
1750 do {
1751 uint8x8_t s0 = vld1_u8(src);
1752 uint8x8_t s1 = vld1_u8(src + src_stride);
1753 uint8x8_t s2 = vld1_u8(src + 2 * src_stride);
1754
1755 uint8x8_t d0 = vrhadd_u8(s0, s1);
1756 uint8x8_t d1 = vrhadd_u8(s1, s2);
1757
1758 vst1_u8(dst, d0);
1759 vst1_u8(dst + dst_stride, d1);
1760
1761 src += 2 * src_stride;
1762 dst += 2 * dst_stride;
1763 h -= 2;
1764 } while (h != 0);
1765 } else {
1766 do {
1767 const uint8_t *src_ptr = src;
1768 uint8_t *dst_ptr = dst;
1769 int height = h;
1770
1771 do {
1772 uint8x16_t s0 = vld1q_u8(src_ptr);
1773 uint8x16_t s1 = vld1q_u8(src_ptr + src_stride);
1774
1775 uint8x16_t d0 = vrhaddq_u8(s0, s1);
1776
1777 vst1q_u8(dst_ptr, d0);
1778
1779 src_ptr += src_stride;
1780 dst_ptr += dst_stride;
1781 } while (--height != 0);
1782 src += 16;
1783 dst += 16;
1784 w -= 16;
1785 } while (w != 0);
1786 }
1787 }
1788
1789 void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
1790 uint8_t *dst, int dst_stride, int w, int h,
1791 const InterpFilterParams *filter_params_x,
1792 const InterpFilterParams *filter_params_y,
1793 const int subpel_x_qn,
1794 const int subpel_y_qn,
1795 ConvolveParams *conv_params) {
1796 assert(subpel_x_qn == 8);
1797 assert(subpel_y_qn == 8);
1798 assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
1799 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
1800 (void)filter_params_x;
1801 (void)subpel_x_qn;
1802 (void)filter_params_y;
1803 (void)subpel_y_qn;
1804 (void)conv_params;
1805
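  // Half-pel in both directions: sum horizontally adjacent pixels into a
  // 16-bit intermediate buffer, then sum vertically adjacent intermediates and
  // apply a rounding shift by 2, i.e. dst = (a + b + c + d + 2) >> 2 for the
  // four neighbouring source pixels.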
1806 uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
1807 int im_h = h + 1;
1808 int im_stride = w;
1809 assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
1810
1811 uint16_t *im = im_block;
1812
1813 // Horizontal filter.
1814 if (w <= 4) {
1815 do {
1816 uint8x8_t s0 = vld1_u8(src);
1817 uint8x8_t s1 = vld1_u8(src + 1);
1818
1819 uint16x4_t sum = vget_low_u16(vaddl_u8(s0, s1));
1820
1821 // Safe to store the whole vector: the im buffer is large enough.
1822 vst1_u16(im, sum);
1823
1824 src += src_stride;
1825 im += im_stride;
1826 } while (--im_h != 0);
1827 } else {
1828 do {
1829 const uint8_t *src_ptr = src;
1830 uint16_t *im_ptr = im;
1831 int width = w;
1832
1833 do {
1834 uint8x8_t s0 = vld1_u8(src_ptr);
1835 uint8x8_t s1 = vld1_u8(src_ptr + 1);
1836
1837 uint16x8_t sum = vaddl_u8(s0, s1);
1838
1839 vst1q_u16(im_ptr, sum);
1840
1841 src_ptr += 8;
1842 im_ptr += 8;
1843 width -= 8;
1844 } while (width != 0);
1845 src += src_stride;
1846 im += im_stride;
1847 } while (--im_h != 0);
1848 }
1849
1850 im = im_block;
1851
1852 // Vertical filter.
1853 if (w <= 4) {
1854 do {
1855 uint16x4_t s0 = vld1_u16(im);
1856 uint16x4_t s1 = vld1_u16(im + im_stride);
1857 uint16x4_t s2 = vld1_u16(im + 2 * im_stride);
1858
1859 uint16x4_t sum0 = vadd_u16(s0, s1);
1860 uint16x4_t sum1 = vadd_u16(s1, s2);
1861
1862 uint8x8_t d0 = vqrshrn_n_u16(vcombine_u16(sum0, vdup_n_u16(0)), 2);
1863 uint8x8_t d1 = vqrshrn_n_u16(vcombine_u16(sum1, vdup_n_u16(0)), 2);
1864
1865 if (w == 2) {
1866 store_u8_2x1(dst + 0 * dst_stride, d0);
1867 store_u8_2x1(dst + 1 * dst_stride, d1);
1868 } else {
1869 store_u8_4x1(dst + 0 * dst_stride, d0);
1870 store_u8_4x1(dst + 1 * dst_stride, d1);
1871 }
1872
1873 im += 2 * im_stride;
1874 dst += 2 * dst_stride;
1875 h -= 2;
1876 } while (h != 0);
1877 } else {
1878 do {
1879 uint16_t *im_ptr = im;
1880 uint8_t *dst_ptr = dst;
1881 int height = h;
1882
1883 do {
1884 uint16x8_t s0 = vld1q_u16(im_ptr);
1885 uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);
1886
1887 uint16x8_t sum = vaddq_u16(s0, s1);
1888 uint8x8_t d0 = vqrshrn_n_u16(sum, 2);
1889
1890 vst1_u8(dst_ptr, d0);
1891
1892 im_ptr += im_stride;
1893 dst_ptr += dst_stride;
1894 } while (--height != 0);
1895 im += 8;
1896 dst += 8;
1897 w -= 8;
1898 } while (w != 0);
1899 }
1900 }
1901