1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <arm_neon.h>
13 #include <assert.h>
14
15 #include "aom_dsp/arm/mem_neon.h"
16 #include "aom_dsp/arm/transpose_neon.h"
17 #include "av1/common/arm/compound_convolve_neon.h"
18 #include "config/aom_config.h"
19 #include "config/av1_rtcd.h"
20
21 static inline int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
22 const int16x4_t s2, const int16x4_t s3,
23 const int16x4_t x_filter,
24 const int16x4_t horiz_const) {
25 int16x4_t sum = horiz_const;
26 sum = vmla_lane_s16(sum, s0, x_filter, 0);
27 sum = vmla_lane_s16(sum, s1, x_filter, 1);
28 sum = vmla_lane_s16(sum, s2, x_filter, 2);
29 sum = vmla_lane_s16(sum, s3, x_filter, 3);
30
31 // We halved the convolution filter values, so subtract 1 from the right shift.
32 return vshr_n_s16(sum, ROUND0_BITS - 1);
33 }
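
// Note on the halved filter values used throughout this file: the taps are
// all even, so with halved taps f'[i] = f[i] / 2 we have, exactly,
//   (sum_i f[i] * s[i]) >> ROUND0_BITS == (sum_i f'[i] * s[i]) >> (ROUND0_BITS - 1).
// The helpers therefore shift by one bit less, and the callers fold the
// matching rounding term (1 << (ROUND0_BITS - 2)) into horiz_const.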
34
35 static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
36 const int16x8_t s2, const int16x8_t s3,
37 const int16x8_t s4, const int16x8_t s5,
38 const int16x8_t s6, const int16x8_t s7,
39 const int16x8_t x_filter,
40 const int16x8_t horiz_const) {
41 const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
42 const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
43
44 int16x8_t sum = horiz_const;
45 sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0);
46 sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
47 sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
48 sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
49 sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
50 sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
51 sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
52 sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
53
54 // We halved the convolution filter values, so subtract 1 from the right shift.
55 return vshrq_n_s16(sum, ROUND0_BITS - 1);
56 }
57
58 static inline void dist_wtd_convolve_2d_horiz_neon(
59 const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
60 const int16_t *x_filter_ptr, const int im_h, int w) {
61 const int bd = 8;
62
63 const uint8_t *src_ptr = src;
64 int16_t *dst_ptr = im_block;
65 int dst_stride = im_stride;
66 int height = im_h;
67
68 if (w == 4) {
69 // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
70 // shifts - which are generally faster than rounding shifts on modern CPUs.
71 // (The extra -1 is needed because we halved the filter values.)
72 const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
73 (1 << ((ROUND0_BITS - 1) - 1)));
74 // 4-tap filters are used for blocks having width <= 4.
75 // Filter values are even, so halve to reduce intermediate precision reqs.
76 const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
77
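    // The 4-tap kernel occupies taps 2..5 of the 8-tap filter array (the
    // outer taps are zero), hence the +2 offsets applied to x_filter_ptr
    // above and to src_ptr below.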
78 src_ptr += 2;
79
80 do {
81 uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
82 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
83 int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
84
85 __builtin_prefetch(dst_ptr);
86
87 int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
88 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
89 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
90
91 int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
92
93 vst1_s16(dst_ptr, d0);
94
95 src_ptr += src_stride;
96 dst_ptr += dst_stride;
97 } while (--height != 0);
98 } else {
99 // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
100 // shifts - which are generally faster than rounding shifts on modern CPUs.
101 // (The extra -1 is needed because we halved the filter values.)
102 const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
103 (1 << ((ROUND0_BITS - 1) - 1)));
104 // Filter values are even, so halve to reduce intermediate precision reqs.
105 const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
106
107 #if AOM_ARCH_AARCH64
108 do {
109 const uint8_t *s;
110 int16_t *d = dst_ptr;
111 int width = w;
112
113 __builtin_prefetch(src_ptr + 0 * src_stride);
114 __builtin_prefetch(src_ptr + 1 * src_stride);
115 __builtin_prefetch(src_ptr + 2 * src_stride);
116 __builtin_prefetch(src_ptr + 3 * src_stride);
117 __builtin_prefetch(src_ptr + 4 * src_stride);
118 __builtin_prefetch(src_ptr + 5 * src_stride);
119 __builtin_prefetch(src_ptr + 6 * src_stride);
120 __builtin_prefetch(src_ptr + 7 * src_stride);
121
122 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
123 load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
124 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
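      // Transposing the 8x8 tile turns the row-wise horizontal filter into a
      // column-wise one: each sN register below holds one input column, so a
      // single convolve8_8_2d_h call produces one output column for 8 rows.
      // The results are transposed back before being stored.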
125
126 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
127 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
128 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
129 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
130 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
131 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
132 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
133
134 s = src_ptr + 7;
135
136 __builtin_prefetch(dst_ptr + 0 * dst_stride);
137 __builtin_prefetch(dst_ptr + 1 * dst_stride);
138 __builtin_prefetch(dst_ptr + 2 * dst_stride);
139 __builtin_prefetch(dst_ptr + 3 * dst_stride);
140 __builtin_prefetch(dst_ptr + 4 * dst_stride);
141 __builtin_prefetch(dst_ptr + 5 * dst_stride);
142 __builtin_prefetch(dst_ptr + 6 * dst_stride);
143 __builtin_prefetch(dst_ptr + 7 * dst_stride);
144
145 do {
146 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
147 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
148
149 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
150 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
151 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
152 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
153 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
154 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
155 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
156 int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
157
158 int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
159 x_filter, horiz_const);
160 int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
161 x_filter, horiz_const);
162 int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
163 x_filter, horiz_const);
164 int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
165 x_filter, horiz_const);
166 int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
167 x_filter, horiz_const);
168 int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
169 x_filter, horiz_const);
170 int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
171 x_filter, horiz_const);
172 int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
173 x_filter, horiz_const);
174
175 transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
176 store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
177
178 s0 = s8;
179 s1 = s9;
180 s2 = s10;
181 s3 = s11;
182 s4 = s12;
183 s5 = s13;
184 s6 = s14;
185 s += 8;
186 d += 8;
187 width -= 8;
188 } while (width > 0);
189 src_ptr += 8 * src_stride;
190 dst_ptr += 8 * dst_stride;
191 height -= 8;
192 } while (height > 8);
193 #endif // AOM_ARCH_AARCH64
194
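    // Process any remaining rows (all rows on 32-bit Arm) one at a time,
    // building the filter taps from shifted windows of a single source row.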
195 do {
196 const uint8_t *s;
197 int16_t *d = dst_ptr;
198 int width = w;
199
200 uint8x8_t t0 = vld1_u8(src_ptr);
201 int16x8_t s0 =
202 vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
203
204 s = src_ptr + 8;
205 __builtin_prefetch(dst_ptr);
206
207 do {
208 t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
209 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
210
211 int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
212 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
213 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
214 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
215 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
216 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
217 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
218
219 int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
220 x_filter, horiz_const);
221 vst1q_s16(d, d0);
222
223 s0 = s8;
224 s += 8;
225 d += 8;
226 width -= 8;
227 } while (width > 0);
228 src_ptr += src_stride;
229 dst_ptr += dst_stride;
230 } while (--height != 0);
231 }
232 }
233
234 void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
235 uint8_t *dst8, int dst8_stride, int w, int h,
236 const InterpFilterParams *filter_params_x,
237 const InterpFilterParams *filter_params_y,
238 const int subpel_x_qn, const int subpel_y_qn,
239 ConvolveParams *conv_params) {
240 assert(w % 4 == 0);
241 assert(h % 4 == 0);
242
243 DECLARE_ALIGNED(16, int16_t,
244 im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
245
246 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
247 const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
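  // Only 6-tap and 8-tap vertical kernels are implemented below, so narrower
  // filters are widened to 6 taps; the unused positions in the 8-tap filter
  // array are zero, so the result is unchanged.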
248
249 const int im_h = h + clamped_y_taps - 1;
250 const int im_stride = MAX_SB_SIZE;
251 const int vert_offset = clamped_y_taps / 2 - 1;
252 const int horiz_offset = filter_params_x->taps / 2 - 1;
253 const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
254 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
255 filter_params_x, subpel_x_qn & SUBPEL_MASK);
256 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
257 filter_params_y, subpel_y_qn & SUBPEL_MASK);
258
259 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
260
261 dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
262 x_filter_ptr, im_h, w);
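  // First pass: horizontal filter into the higher-precision intermediate
  // buffer im_block. Second pass below: vertical filter, with optional
  // (dist-weighted) averaging against the compound prediction buffer.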
263
264 if (clamped_y_taps == 6) {
265 if (conv_params->do_average) {
266 if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
267 dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
268 im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
269 w);
270 } else {
271 dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
272 dst8_stride, conv_params,
273 y_filter, h, w);
274 }
275 } else {
276 dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
277 y_filter, h, w);
278 }
279 } else {
280 if (conv_params->do_average) {
281 if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
282 dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
283 im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
284 w);
285 } else {
286 dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
287 dst8_stride, conv_params,
288 y_filter, h, w);
289 }
290 } else {
291 dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
292 y_filter, h, w);
293 }
294 }
295 }
296
297 static inline void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
298 const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
299 int h, ConvolveParams *conv_params) {
300 assert(w % 4 == 0);
301 assert(h % 4 == 0);
302
303 const int bd = 8;
304 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
305 const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
306 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
307 const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
308 const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
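  // The copy path produces the same intermediate format as a full compound
  // convolution: vmlal_u8(round_offset_vec, s, shift_by_bits) computes
  // round_offset + (s << (FILTER_BITS - ROUND0_BITS)) for each pixel, i.e.
  // the pixel scaled to compound precision plus the usual compound offset.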
309
310 const uint16_t fwd_offset = conv_params->fwd_offset;
311 const uint16_t bck_offset = conv_params->bck_offset;
312
313 CONV_BUF_TYPE *dst = conv_params->dst;
314 const int dst_stride = conv_params->dst_stride;
315 int height = h;
316
317 if (w == 4) {
318 do {
319 uint8x8_t s0, s1, s2, s3;
320 load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
321
322 uint16x4_t d0 =
323 vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
324 uint16x4_t d1 =
325 vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
326 uint16x4_t d2 =
327 vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
328 uint16x4_t d3 =
329 vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
330
331 uint16x4_t dd0, dd1, dd2, dd3;
332 load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
333
334 uint8x8_t d01, d23;
335 compute_dist_wtd_avg_4x4(
336 dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset,
337 vreinterpretq_s16_u16(round_offset_vec), &d01, &d23);
338
339 store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01);
340 store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23);
341
342 src += 4 * src_stride;
343 dst += 4 * dst_stride;
344 dst8 += 4 * dst8_stride;
345 height -= 4;
346 } while (height != 0);
347 } else {
348 do {
349 const uint8_t *s = src;
350 CONV_BUF_TYPE *d = dst;
351 uint8_t *d_u8 = dst8;
352 int width = w;
353
354 do {
355 uint8x8_t s0, s1, s2, s3;
356 load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
357
358 uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
359 uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
360 uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
361 uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
362
363 uint16x8_t dd0, dd1, dd2, dd3;
364 load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
365
366 uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
367 compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
368 bck_offset,
369 vreinterpretq_s16_u16(round_offset_vec),
370 &d0_u8, &d1_u8, &d2_u8, &d3_u8);
371
372 store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
373
374 s += 8;
375 d += 8;
376 d_u8 += 8;
377 width -= 8;
378 } while (width != 0);
379 src += 4 * src_stride;
380 dst += 4 * dst_stride;
381 dst8 += 4 * dst8_stride;
382 height -= 4;
383 } while (height != 0);
384 }
385 }
386
387 static inline void dist_wtd_convolve_2d_copy_avg_neon(
388 const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
389 int h, ConvolveParams *conv_params) {
390 assert(w % 4 == 0);
391 assert(h % 4 == 0);
392
393 const int bd = 8;
394 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
395 const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
396 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
397 const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
398 const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
399
400 CONV_BUF_TYPE *dst = conv_params->dst;
401 const int dst_stride = conv_params->dst_stride;
402 int height = h;
403
404 if (w == 4) {
405 do {
406 uint8x8_t s0, s1, s2, s3;
407 load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
408
409 uint16x4_t d0 =
410 vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
411 uint16x4_t d1 =
412 vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
413 uint16x4_t d2 =
414 vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
415 uint16x4_t d3 =
416 vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
417
418 uint16x4_t dd0, dd1, dd2, dd3;
419 load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
420
421 uint8x8_t d01, d23;
422 compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
423 vreinterpretq_s16_u16(round_offset_vec), &d01,
424 &d23);
425
426 store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01);
427 store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23);
428
429 src += 4 * src_stride;
430 dst += 4 * dst_stride;
431 dst8 += 4 * dst8_stride;
432 height -= 4;
433 } while (height != 0);
434 } else {
435 do {
436 const uint8_t *s = src;
437 CONV_BUF_TYPE *d = dst;
438 uint8_t *d_u8 = dst8;
439 int width = w;
440
441 do {
442 uint8x8_t s0, s1, s2, s3;
443 load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
444
445 uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
446 uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
447 uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
448 uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
449
450 uint16x8_t dd0, dd1, dd2, dd3;
451 load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
452
453 uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
454 compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
455 vreinterpretq_s16_u16(round_offset_vec), &d0_u8,
456 &d1_u8, &d2_u8, &d3_u8);
457
458 store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
459
460 s += 8;
461 d += 8;
462 d_u8 += 8;
463 width -= 8;
464 } while (width != 0);
465 src += 4 * src_stride;
466 dst += 4 * dst_stride;
467 dst8 += 4 * dst8_stride;
468 height -= 4;
469 } while (height != 0);
470 }
471 }
472
473 static inline void dist_wtd_convolve_2d_copy_neon(const uint8_t *src,
474 int src_stride, int w, int h,
475 ConvolveParams *conv_params) {
476 assert(w % 4 == 0);
477 assert(h % 4 == 0);
478
479 const int bd = 8;
480 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
481 const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
482 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
483 const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
484 const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
485
486 CONV_BUF_TYPE *dst = conv_params->dst;
487 const int dst_stride = conv_params->dst_stride;
488 int height = h;
489
490 if (w == 4) {
491 do {
492 uint8x8_t s0, s1, s2, s3;
493 load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
494
495 uint16x4_t d0 =
496 vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
497 uint16x4_t d1 =
498 vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
499 uint16x4_t d2 =
500 vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
501 uint16x4_t d3 =
502 vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
503
504 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
505
506 src += 4 * src_stride;
507 dst += 4 * dst_stride;
508 height -= 4;
509 } while (height != 0);
510 } else {
511 do {
512 const uint8_t *s = src;
513 CONV_BUF_TYPE *d = dst;
514 int width = w;
515
516 do {
517 uint8x8_t s0, s1, s2, s3;
518 load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
519
520 uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
521 uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
522 uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
523 uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
524
525 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
526
527 s += 8;
528 d += 8;
529 width -= 8;
530 } while (width != 0);
531 src += 4 * src_stride;
532 dst += 4 * dst_stride;
533 height -= 4;
534 } while (height != 0);
535 }
536 }
537
538 void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
539 uint8_t *dst8, int dst8_stride, int w,
540 int h, ConvolveParams *conv_params) {
541 if (conv_params->do_average) {
542 if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
543 dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
544 src, src_stride, dst8, dst8_stride, w, h, conv_params);
545 } else {
546 dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w,
547 h, conv_params);
548 }
549 } else {
550 dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params);
551 }
552 }
553
554 static inline uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
555 const int16x4_t s2, const int16x4_t s3,
556 const int16x4_t x_filter,
557 const int16x4_t round_offset) {
558 int16x4_t sum = vmul_lane_s16(s0, x_filter, 0);
559 sum = vmla_lane_s16(sum, s1, x_filter, 1);
560 sum = vmla_lane_s16(sum, s2, x_filter, 2);
561 sum = vmla_lane_s16(sum, s3, x_filter, 3);
562
563 // We halved the convolution filter values, so subtract 1 from the right shift.
564 int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
565 return vreinterpret_u16_s16(res);
566 }
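
// vrsra_n_s16(acc, x, n) computes acc + ((x + (1 << (n - 1))) >> n), so the
// rounding right shift by (ROUND0_BITS - 1) and the addition of round_offset
// are fused into a single instruction here and in convolve8_8_x below.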
567
568 static inline uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
569 const int16x8_t s2, const int16x8_t s3,
570 const int16x8_t s4, const int16x8_t s5,
571 const int16x8_t s6, const int16x8_t s7,
572 const int16x8_t x_filter,
573 const int16x8_t round_offset) {
574 const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
575 const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
576
577 int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0);
578 sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
579 sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
580 sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
581 sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
582 sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
583 sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
584 sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
585
586 // We halved the convolution filter values, so subtract 1 from the right shift.
587 int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
588 return vreinterpretq_u16_s16(res);
589 }
590
591 static inline void dist_wtd_convolve_x_dist_wtd_avg_neon(
592 const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
593 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
594 ConvolveParams *conv_params) {
595 assert(w % 4 == 0);
596 assert(h % 4 == 0);
597
598 const int bd = 8;
599 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
600 const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
601 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
602 const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
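  // Assuming the usual 8-bit build constants (FILTER_BITS == 7,
  // ROUND0_BITS == 3, COMPOUND_ROUND1_BITS == 7), offset_bits is 19 and
  // round_offset is (1 << 12) + (1 << 11) == 6144, which fits comfortably in
  // int16_t.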
603
604 const uint16_t fwd_offset = conv_params->fwd_offset;
605 const uint16_t bck_offset = conv_params->bck_offset;
606
607 // Horizontal filter.
608 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
609 filter_params_x, subpel_x_qn & SUBPEL_MASK);
610
611 const int horiz_offset = filter_params_x->taps / 2 - 1;
612 const uint8_t *src_ptr = src - horiz_offset;
613 CONV_BUF_TYPE *dst_ptr = conv_params->dst;
614 uint8_t *dst8_ptr = dst8;
615 int dst_stride = conv_params->dst_stride;
616 int height = h;
617
618 if (w == 4) {
619 // 4-tap filters are used for blocks having width <= 4.
620 // Filter values are even, so halve to reduce intermediate precision reqs.
621 const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
622
623 src_ptr += 2;
624
625 do {
626 uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
627 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
628 int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
629
630 __builtin_prefetch(dst_ptr);
631 __builtin_prefetch(dst8_ptr);
632
633 int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
634 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
635 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
636
637 uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
638 vget_low_s16(round_offset_vec));
639
640 uint16x4_t dd0 = vld1_u16(dst_ptr);
641
642 uint8x8_t d01;
643 compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
644 vget_low_s16(round_offset_vec), &d01);
645
646 store_u8_4x1(dst8_ptr, d01);
647
648 src_ptr += src_stride;
649 dst_ptr += dst_stride;
650 dst8_ptr += dst8_stride;
651 } while (--height != 0);
652 } else {
653 // Filter values are even, so halve to reduce intermediate precision reqs.
654 const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
655
656 #if AOM_ARCH_AARCH64
657 while (height >= 8) {
658 const uint8_t *s = src_ptr;
659 CONV_BUF_TYPE *d = dst_ptr;
660 uint8_t *d_u8 = dst8_ptr;
661 int width = w;
662
663 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
664 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
665 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
666
667 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
668 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
669 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
670 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
671 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
672 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
673 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
674
675 __builtin_prefetch(d + 0 * dst_stride);
676 __builtin_prefetch(d + 1 * dst_stride);
677 __builtin_prefetch(d + 2 * dst_stride);
678 __builtin_prefetch(d + 3 * dst_stride);
679 __builtin_prefetch(d + 4 * dst_stride);
680 __builtin_prefetch(d + 5 * dst_stride);
681 __builtin_prefetch(d + 6 * dst_stride);
682 __builtin_prefetch(d + 7 * dst_stride);
683
684 s += 7;
685
686 do {
687 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
688 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
689
690 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
691 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
692 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
693 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
694 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
695 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
696 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
697 int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
698
699 uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
700 round_offset_vec);
701 uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
702 round_offset_vec);
703 uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
704 round_offset_vec);
705 uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
706 round_offset_vec);
707 uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
708 x_filter, round_offset_vec);
709 uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
710 x_filter, round_offset_vec);
711 uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
712 x_filter, round_offset_vec);
713 uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
714 x_filter, round_offset_vec);
715
716 transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
717
718 uint16x8_t dd0, dd1, dd2, dd3;
719 load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
720
721 uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
722 compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
723 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
724 &d2_u8, &d3_u8);
725
726 store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
727
728 uint16x8_t dd4, dd5, dd6, dd7;
729 load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
730
731 uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
732 compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
733 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
734 &d6_u8, &d7_u8);
735
736 store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
737 d7_u8);
738
739 s0 = s8;
740 s1 = s9;
741 s2 = s10;
742 s3 = s11;
743 s4 = s12;
744 s5 = s13;
745 s6 = s14;
746 s += 8;
747 d += 8;
748 d_u8 += 8;
749 width -= 8;
750 } while (width != 0);
751 src_ptr += 8 * src_stride;
752 dst_ptr += 8 * dst_stride;
753 dst8_ptr += 8 * dst8_stride;
754 height -= 8;
755 }
756 #endif // AOM_ARCH_AARCH64
757
758 while (height > 0) {
759 const uint8_t *s = src_ptr;
760 CONV_BUF_TYPE *d = dst_ptr;
761 uint8_t *d_u8 = dst8_ptr;
762 int width = w;
763
764 uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
765 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
766
767 __builtin_prefetch(d);
768
769 s += 8;
770
771 do {
772 t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
773 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
774
775 int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
776 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
777 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
778 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
779 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
780 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
781 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
782
783 uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
784 round_offset_vec);
785
786 uint16x8_t dd0 = vld1q_u16(d);
787
788 uint8x8_t d0_u8;
789 compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
790 round_offset_vec, &d0_u8);
791
792 vst1_u8(d_u8, d0_u8);
793
794 s0 = s8;
795 s += 8;
796 d += 8;
797 d_u8 += 8;
798 width -= 8;
799 } while (width != 0);
800 src_ptr += src_stride;
801 dst_ptr += dst_stride;
802 dst8_ptr += dst8_stride;
803 height--;
804 }
805 }
806 }
807
808 static inline void dist_wtd_convolve_x_avg_neon(
809 const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
810 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
811 ConvolveParams *conv_params) {
812 assert(w % 4 == 0);
813 assert(h % 4 == 0);
814
815 const int bd = 8;
816 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
817 const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
818 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
819 const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
820
821 // Horizontal filter.
822 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
823 filter_params_x, subpel_x_qn & SUBPEL_MASK);
824
825 const int horiz_offset = filter_params_x->taps / 2 - 1;
826 const uint8_t *src_ptr = src - horiz_offset;
827 CONV_BUF_TYPE *dst_ptr = conv_params->dst;
828 uint8_t *dst8_ptr = dst8;
829 int dst_stride = conv_params->dst_stride;
830 int height = h;
831
832 if (w == 4) {
833 // 4-tap filters are used for blocks having width <= 4.
834 // Filter values are even, so halve to reduce intermediate precision reqs.
835 const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
836
837 src_ptr += 2;
838
839 do {
840 uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
841 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
842 int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
843
844 __builtin_prefetch(dst_ptr);
845 __builtin_prefetch(dst8_ptr);
846
847 int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
848 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
849 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
850
851 uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
852 vget_low_s16(round_offset_vec));
853
854 uint16x4_t dd0 = vld1_u16(dst_ptr);
855
856 uint8x8_t d01;
857 compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
858
859 store_u8_4x1(dst8_ptr, d01);
860
861 src_ptr += src_stride;
862 dst_ptr += dst_stride;
863 dst8_ptr += dst8_stride;
864 } while (--height != 0);
865 } else {
866 // Filter values are even, so halve to reduce intermediate precision reqs.
867 const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
868
869 #if AOM_ARCH_AARCH64
870 while (height >= 8) {
871 const uint8_t *s = src_ptr;
872 CONV_BUF_TYPE *d = dst_ptr;
873 uint8_t *d_u8 = dst8_ptr;
874 int width = w;
875
876 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
877 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
878 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
879
880 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
881 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
882 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
883 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
884 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
885 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
886 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
887
888 __builtin_prefetch(d + 0 * dst_stride);
889 __builtin_prefetch(d + 1 * dst_stride);
890 __builtin_prefetch(d + 2 * dst_stride);
891 __builtin_prefetch(d + 3 * dst_stride);
892 __builtin_prefetch(d + 4 * dst_stride);
893 __builtin_prefetch(d + 5 * dst_stride);
894 __builtin_prefetch(d + 6 * dst_stride);
895 __builtin_prefetch(d + 7 * dst_stride);
896
897 s += 7;
898
899 do {
900 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
901 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
902
903 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
904 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
905 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
906 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
907 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
908 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
909 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
910 int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
911
912 uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
913 round_offset_vec);
914 uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
915 round_offset_vec);
916 uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
917 round_offset_vec);
918 uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
919 round_offset_vec);
920 uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
921 x_filter, round_offset_vec);
922 uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
923 x_filter, round_offset_vec);
924 uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
925 x_filter, round_offset_vec);
926 uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
927 x_filter, round_offset_vec);
928
929 transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
930
931 uint16x8_t dd0, dd1, dd2, dd3;
932 load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
933
934 uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
935 compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
936 round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
937
938 store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
939
940 uint16x8_t dd4, dd5, dd6, dd7;
941 load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
942
943 uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
944 compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
945 round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
946
947 store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
948 d7_u8);
949
950 s0 = s8;
951 s1 = s9;
952 s2 = s10;
953 s3 = s11;
954 s4 = s12;
955 s5 = s13;
956 s6 = s14;
957 s += 8;
958 d += 8;
959 d_u8 += 8;
960 width -= 8;
961 } while (width != 0);
962 src_ptr += 8 * src_stride;
963 dst_ptr += 8 * dst_stride;
964 dst8_ptr += 8 * dst8_stride;
965 height -= 8;
966 }
967 #endif // AOM_ARCH_AARCH64
968
969 while (height > 0) {
970 const uint8_t *s = src_ptr;
971 CONV_BUF_TYPE *d = dst_ptr;
972 uint8_t *d_u8 = dst8_ptr;
973 int width = w;
974
975 uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
976 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
977
978 __builtin_prefetch(d);
979
980 s += 8;
981
982 do {
983 t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
984 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
985
986 int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
987 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
988 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
989 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
990 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
991 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
992 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
993
994 uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
995 round_offset_vec);
996
997 uint16x8_t dd0 = vld1q_u16(d);
998
999 uint8x8_t d0_u8;
1000 compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
1001
1002 vst1_u8(d_u8, d0_u8);
1003
1004 s0 = s8;
1005 s += 8;
1006 d += 8;
1007 d_u8 += 8;
1008 width -= 8;
1009 } while (width != 0);
1010 src_ptr += src_stride;
1011 dst_ptr += dst_stride;
1012 dst8_ptr += dst8_stride;
1013 height--;
1014 }
1015 }
1016 }
1017
1018 static inline void dist_wtd_convolve_x_neon(
1019 const uint8_t *src, int src_stride, int w, int h,
1020 const InterpFilterParams *filter_params_x, const int subpel_x_qn,
1021 ConvolveParams *conv_params) {
1022 assert(w % 4 == 0);
1023 assert(h % 4 == 0);
1024
1025 const int bd = 8;
1026 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
1027 const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
1028 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
1029 const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
1030
1031 // Horizontal filter.
1032 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
1033 filter_params_x, subpel_x_qn & SUBPEL_MASK);
1034
1035 const int horiz_offset = filter_params_x->taps / 2 - 1;
1036 const uint8_t *src_ptr = src - horiz_offset;
1037 CONV_BUF_TYPE *dst_ptr = conv_params->dst;
1038 int dst_stride = conv_params->dst_stride;
1039 int height = h;
1040
1041 if (w == 4) {
1042 // 4-tap filters are used for blocks having width <= 4.
1043 // Filter values are even, so halve to reduce intermediate precision reqs.
1044 const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
1045
1046 src_ptr += 2;
1047
1048 do {
1049 uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
1050 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1051 int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1052
1053 __builtin_prefetch(dst_ptr);
1054
1055 int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
1056 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
1057 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
1058
1059 uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
1060 vget_low_s16(round_offset_vec));
1061
1062 vst1_u16(dst_ptr, d0);
1063
1064 src_ptr += src_stride;
1065 dst_ptr += dst_stride;
1066 } while (--height != 0);
1067 } else {
1068 // Filter values are even, so halve to reduce intermediate precision reqs.
1069 const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
1070
1071 #if AOM_ARCH_AARCH64
1072 while (height >= 8) {
1073 const uint8_t *s = src_ptr;
1074 CONV_BUF_TYPE *d = dst_ptr;
1075 int width = w;
1076
1077 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
1078 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1079 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1080
1081 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1082 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1083 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1084 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1085 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1086 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1087 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1088
1089 __builtin_prefetch(d + 0 * dst_stride);
1090 __builtin_prefetch(d + 1 * dst_stride);
1091 __builtin_prefetch(d + 2 * dst_stride);
1092 __builtin_prefetch(d + 3 * dst_stride);
1093 __builtin_prefetch(d + 4 * dst_stride);
1094 __builtin_prefetch(d + 5 * dst_stride);
1095 __builtin_prefetch(d + 6 * dst_stride);
1096 __builtin_prefetch(d + 7 * dst_stride);
1097
1098 s += 7;
1099
1100 do {
1101 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1102 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1103
1104 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
1105 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1106 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
1107 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
1108 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
1109 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
1110 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
1111 int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
1112
1113 uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
1114 round_offset_vec);
1115 uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
1116 round_offset_vec);
1117 uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
1118 round_offset_vec);
1119 uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
1120 round_offset_vec);
1121 uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
1122 x_filter, round_offset_vec);
1123 uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
1124 x_filter, round_offset_vec);
1125 uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
1126 x_filter, round_offset_vec);
1127 uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
1128 x_filter, round_offset_vec);
1129
1130 transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
1131
1132 store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
1133
1134 s0 = s8;
1135 s1 = s9;
1136 s2 = s10;
1137 s3 = s11;
1138 s4 = s12;
1139 s5 = s13;
1140 s6 = s14;
1141 s += 8;
1142 d += 8;
1143 width -= 8;
1144 } while (width != 0);
1145 src_ptr += 8 * src_stride;
1146 dst_ptr += 8 * dst_stride;
1147 height -= 8;
1148 }
1149 #endif // AOM_ARCH_AARCH64
1150
1151 while (height > 0) {
1152 const uint8_t *s = src_ptr;
1153 CONV_BUF_TYPE *d = dst_ptr;
1154 int width = w;
1155
1156 uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
1157 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1158
1159 __builtin_prefetch(d);
1160
1161 s = src_ptr + 8;
1162
1163 do {
1164 t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
1165 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
1166
1167 int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
1168 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
1169 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
1170 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
1171 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
1172 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
1173 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
1174
1175 uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
1176 round_offset_vec);
1177
1178 vst1q_u16(d, d0);
1179
1180 s0 = s8;
1181 s += 8;
1182 d += 8;
1183 width -= 8;
1184 } while (width != 0);
1185 src_ptr += src_stride;
1186 dst_ptr += dst_stride;
1187 height--;
1188 }
1189 }
1190 }
1191
1192 void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
1193 uint8_t *dst8, int dst8_stride, int w, int h,
1194 const InterpFilterParams *filter_params_x,
1195 const int subpel_x_qn,
1196 ConvolveParams *conv_params) {
1197 if (conv_params->do_average) {
1198 if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
1199 dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride,
1200 w, h, filter_params_x, subpel_x_qn,
1201 conv_params);
1202 } else {
1203 dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h,
1204 filter_params_x, subpel_x_qn, conv_params);
1205 }
1206 } else {
1207 dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x,
1208 subpel_x_qn, conv_params);
1209 }
1210 }
1211
1212 static inline uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
1213 const int16x4_t s2, const int16x4_t s3,
1214 const int16x4_t s4, const int16x4_t s5,
1215 const int16x8_t y_filter,
1216 const int16x4_t round_offset) {
1217 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1218 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1219
1220 // Filter values at indices 0 and 7 are 0.
1221 int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
1222 sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
1223 sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
1224 sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
1225 sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
1226 sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
1227
1228 // We halved the convolution filter values, so subtract 1 from the right shift.
1229 int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
1230 return vreinterpret_u16_s16(res);
1231 }
1232
1233 static inline uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
1234 const int16x8_t s2, const int16x8_t s3,
1235 const int16x8_t s4, const int16x8_t s5,
1236 const int16x8_t y_filter,
1237 const int16x8_t round_offset) {
1238 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1239 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1240
1241 // Filter values at indices 0 and 7 are 0.
1242 int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1);
1243 sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2);
1244 sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3);
1245 sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0);
1246 sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1);
1247 sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2);
1248
1249 // We halved the convolution filter values, so subtract 1 from the right shift.
1250 int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
1251 return vreinterpretq_u16_s16(res);
1252 }
1253
1254 static inline void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
1255 const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
1256 const int dst8_stride, int w, int h, const int16x8_t y_filter,
1257 ConvolveParams *conv_params) {
1258 const int bd = 8;
1259 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
1260 const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
1261 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
1262 const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
1263
1264 const uint16_t fwd_offset = conv_params->fwd_offset;
1265 const uint16_t bck_offset = conv_params->bck_offset;
1266
1267 CONV_BUF_TYPE *dst_ptr = conv_params->dst;
1268 const int dst_stride = conv_params->dst_stride;
1269 int width = w;
1270
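  // Blocks with a 4-pixel dimension are filtered in 4-wide column strips;
  // all other block sizes are filtered in 8-wide column strips.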
1271 if (w == 4 || h == 4) {
1272 do {
1273 const uint8_t *s = src_ptr;
1274 CONV_BUF_TYPE *d = dst_ptr;
1275 uint8_t *d_u8 = dst8_ptr;
1276 int height = h;
1277
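      // A 6-tap vertical filter needs 6 input rows per output row: preload
      // the first 5 rows here and fetch the remaining rows inside the loop.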
1278 uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1279 uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1280 uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1281 uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1282 uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
1283
1284 int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1285 int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1286 int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1287 int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1288 int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
1289
1290 s += 5 * src_stride;
1291
1292 do {
1293 #if AOM_ARCH_AARCH64
1294 t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1295 t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1296 t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1297 t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1298
1299 int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1300 int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1301 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1302 int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1303
1304 uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1305 vget_low_s16(round_offset_vec));
1306 uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
1307 vget_low_s16(round_offset_vec));
1308 uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
1309 vget_low_s16(round_offset_vec));
1310 uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
1311 vget_low_s16(round_offset_vec));
1312
1313 uint16x4_t dd0, dd1, dd2, dd3;
1314 load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
1315
1316 uint8x8_t d01, d23;
1317 compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
1318 bck_offset, round_offset_vec, &d01, &d23);
1319
1320 store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
1321 store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
1322
1323 s0 = s4;
1324 s1 = s5;
1325 s2 = s6;
1326 s3 = s7;
1327 s4 = s8;
1328 s += 4 * src_stride;
1329 d += 4 * dst_stride;
1330 d_u8 += 4 * dst8_stride;
1331 height -= 4;
1332 #else // !AOM_ARCH_AARCH64
1333 t0 = load_unaligned_u8_4x1(s);
1334 int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1335
1336 uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1337 vget_low_s16(round_offset_vec));
1338
1339 uint16x4_t dd0 = vld1_u16(d);
1340
1341 uint8x8_t d01;
1342 compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
1343 vget_low_s16(round_offset_vec), &d01);
1344
1345 store_u8_4x1(d_u8, d01);
1346
1347 s0 = s1;
1348 s1 = s2;
1349 s2 = s3;
1350 s3 = s4;
1351 s4 = s5;
1352 s += src_stride;
1353 d += dst_stride;
1354 d_u8 += dst8_stride;
1355 height--;
1356 #endif // AOM_ARCH_AARCH64
1357 } while (height != 0);
1358 src_ptr += 4;
1359 dst_ptr += 4;
1360 dst8_ptr += 4;
1361 width -= 4;
1362 } while (width != 0);
1363 } else {
1364 do {
1365 const uint8_t *s = src_ptr + (5 * src_stride);
1366 CONV_BUF_TYPE *d = dst_ptr;
1367 uint8_t *d_u8 = dst8_ptr;
1368 int height = h;
1369
1370 uint8x8_t t0, t1, t2, t3, t4;
1371 load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
1372
1373 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1374 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1375 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1376 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1377 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1378
1379 do {
1380 #if AOM_ARCH_AARCH64
1381 uint8x8_t t5, t6, t7;
1382 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1383
1384 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
1385 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
1386 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
1387 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
1388 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
1389 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
1390 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
1391 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
1392
1393 uint16x8_t d0 =
1394 convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1395 uint16x8_t d1 =
1396 convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
1397 uint16x8_t d2 =
1398 convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
1399 uint16x8_t d3 =
1400 convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
1401 uint16x8_t d4 =
1402 convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
1403 uint16x8_t d5 =
1404 convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
1405 uint16x8_t d6 =
1406 convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
1407 uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
1408 round_offset_vec);
1409
1410 uint16x8_t dd0, dd1, dd2, dd3;
1411 load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
1412
1413 uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
1414 compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
1415 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
1416 &d2_u8, &d3_u8);
1417
1418 store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
1419 d_u8 += 4 * dst8_stride;
1420
1421 uint16x8_t dd4, dd5, dd6, dd7;
1422 load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
1423
1424 uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
1425 compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
1426 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
1427 &d6_u8, &d7_u8);
1428
1429 store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
1430 d_u8 += 4 * dst8_stride;
1431
1432 s0 = s8;
1433 s1 = s9;
1434 s2 = s10;
1435 s3 = s11;
1436 s4 = s12;
1437 s += 8 * src_stride;
1438 d += 8 * dst_stride;
1439 height -= 8;
1440 #else // !AOM_ARCH_AARCH64
1441 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
1442
1443 uint16x8_t d0 =
1444 convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1445
1446 s0 = s1;
1447 s1 = s2;
1448 s2 = s3;
1449 s3 = s4;
1450 s4 = s5;
1451
1452 uint16x8_t dd0 = vld1q_u16(d);
1453
1454 uint8x8_t d0_u8;
1455 compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
1456 round_offset_vec, &d0_u8);
1457
1458 vst1_u8(d_u8, d0_u8);
1459 d_u8 += dst8_stride;
1460
1461 s += src_stride;
1462 d += dst_stride;
1463 height--;
1464 #endif // AOM_ARCH_AARCH64
1465 } while (height != 0);
1466 src_ptr += 8;
1467 dst_ptr += 8;
1468 dst8_ptr += 8;
1469 width -= 8;
1470 } while (width != 0);
1471 }
1472 }
1473
1474 static inline void dist_wtd_convolve_y_6tap_avg_neon(
1475 const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
1476 const int dst8_stride, int w, int h, const int16x8_t y_filter,
1477 ConvolveParams *conv_params) {
1478 const int bd = 8;
1479 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
1480 const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
1481 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
1482 const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
1483
1484 CONV_BUF_TYPE *dst_ptr = conv_params->dst;
1485 const int dst_stride = conv_params->dst_stride;
1486 int width = w;
1487
1488 if (w == 4 || h == 4) {
1489 do {
1490 const uint8_t *s = src_ptr;
1491 CONV_BUF_TYPE *d = dst_ptr;
1492 uint8_t *d_u8 = dst8_ptr;
1493 int height = h;
1494
1495 uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1496 uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1497 uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1498 uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1499 uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
1500
1501 int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1502 int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1503 int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1504 int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1505 int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
1506
1507 s += 5 * src_stride;
1508
1509 do {
1510 #if AOM_ARCH_AARCH64
1511 t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1512 t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1513 t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1514 t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1515
1516 int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1517 int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1518 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1519 int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1520
1521 uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1522 vget_low_s16(round_offset_vec));
1523 uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
1524 vget_low_s16(round_offset_vec));
1525 uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
1526 vget_low_s16(round_offset_vec));
1527 uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
1528 vget_low_s16(round_offset_vec));
1529
1530 uint16x4_t dd0, dd1, dd2, dd3;
1531 load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
1532
1533 uint8x8_t d01, d23;
1534 compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
1535 round_offset_vec, &d01, &d23);
1536
1537 store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
1538 store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
1539
1540 s0 = s4;
1541 s1 = s5;
1542 s2 = s6;
1543 s3 = s7;
1544 s4 = s8;
1545 s += 4 * src_stride;
1546 d += 4 * dst_stride;
1547 d_u8 += 4 * dst8_stride;
1548 height -= 4;
1549 #else // !AOM_ARCH_AARCH64
1550 t0 = load_unaligned_u8_4x1(s);
1551 int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1552
1553 uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1554 vget_low_s16(round_offset_vec));
1555
1556 uint16x4_t dd0 = vld1_u16(d);
1557
1558 uint8x8_t d01;
1559 compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
1560
1561 store_u8_4x1(d_u8, d01);
1562
1563 s0 = s1;
1564 s1 = s2;
1565 s2 = s3;
1566 s3 = s4;
1567 s4 = s5;
1568 s += src_stride;
1569 d += dst_stride;
1570 d_u8 += dst8_stride;
1571 height--;
1572 #endif // AOM_ARCH_AARCH64
1573 } while (height != 0);
1574 src_ptr += 4;
1575 dst_ptr += 4;
1576 dst8_ptr += 4;
1577 width -= 4;
1578 } while (width != 0);
1579 } else {
1580 do {
1581 const uint8_t *s = src_ptr + (5 * src_stride);
1582 CONV_BUF_TYPE *d = dst_ptr;
1583 uint8_t *d_u8 = dst8_ptr;
1584 int height = h;
1585
1586 uint8x8_t t0, t1, t2, t3, t4;
1587 load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
1588
1589 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1590 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1591 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1592 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1593 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1594
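    // On AArch64 there are enough Neon registers to keep s0-s12 live, so
    // eight rows are produced per iteration; 32-bit Arm falls back to a
    // single row per iteration.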
1595 do {
1596 #if AOM_ARCH_AARCH64
1597 uint8x8_t t5, t6, t7;
1598 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1599
1600 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
1601 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
1602 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
1603 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
1604 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
1605 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
1606 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
1607 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
1608
1609 uint16x8_t d0 =
1610 convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1611 uint16x8_t d1 =
1612 convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
1613 uint16x8_t d2 =
1614 convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
1615 uint16x8_t d3 =
1616 convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
1617 uint16x8_t d4 =
1618 convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
1619 uint16x8_t d5 =
1620 convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
1621 uint16x8_t d6 =
1622 convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
1623 uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
1624 round_offset_vec);
1625
1626 uint16x8_t dd0, dd1, dd2, dd3;
1627 load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
1628
1629 uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
1630 compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
1631 round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
1632
1633 store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
1634 d_u8 += 4 * dst8_stride;
1635
1636 uint16x8_t dd4, dd5, dd6, dd7;
1637 load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
1638
1639 uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
1640 compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
1641 round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
1642
1643 store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
1644 d_u8 += 4 * dst8_stride;
1645
1646 s0 = s8;
1647 s1 = s9;
1648 s2 = s10;
1649 s3 = s11;
1650 s4 = s12;
1651 s += 8 * src_stride;
1652 d += 8 * dst_stride;
1653 height -= 8;
1654 #else // !AOM_ARCH_AARCH64
1655 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
1656
1657 uint16x8_t d0 =
1658 convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1659
1660 s0 = s1;
1661 s1 = s2;
1662 s2 = s3;
1663 s3 = s4;
1664 s4 = s5;
1665
1666 uint16x8_t dd0 = vld1q_u16(d);
1667
1668 uint8x8_t d0_u8;
1669 compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
1670
1671 vst1_u8(d_u8, d0_u8);
1672 d_u8 += dst8_stride;
1673
1674 s += src_stride;
1675 d += dst_stride;
1676 height--;
1677 #endif // AOM_ARCH_AARCH64
1678 } while (height != 0);
1679 src_ptr += 8;
1680 dst_ptr += 8;
1681 dst8_ptr += 8;
1682 width -= 8;
1683 } while (width != 0);
1684 }
1685 }
1686
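// 6-tap vertical pass with no averaging: only the offset 16-bit convolution
// result is written to conv_params->dst, to be combined with a second
// prediction later in the compound prediction process.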
1687 static inline void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr,
1688 int src_stride, int w, int h,
1689 const int16x8_t y_filter,
1690 ConvolveParams *conv_params) {
1691 const int bd = 8;
1692 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
1693 const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
1694 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
1695 const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
1696
1697 CONV_BUF_TYPE *dst_ptr = conv_params->dst;
1698 const int dst_stride = conv_params->dst_stride;
1699 int width = w;
1700
1701 if (w == 4 || h == 4) {
1702 do {
1703 const uint8_t *s = src_ptr;
1704 CONV_BUF_TYPE *d = dst_ptr;
1705 int height = h;
1706
1707 uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1708 uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1709 uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1710 uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1711 uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
1712
1713 int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1714 int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1715 int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1716 int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1717 int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
1718
1719 s += 5 * src_stride;
1720
1721 do {
1722 #if AOM_ARCH_AARCH64
1723 t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1724 t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1725 t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1726 t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1727
1728 int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1729 int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1730 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1731 int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1732
1733 uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1734 vget_low_s16(round_offset_vec));
1735 uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
1736 vget_low_s16(round_offset_vec));
1737 uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
1738 vget_low_s16(round_offset_vec));
1739 uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
1740 vget_low_s16(round_offset_vec));
1741
1742 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
1743
1744 s0 = s4;
1745 s1 = s5;
1746 s2 = s6;
1747 s3 = s7;
1748 s4 = s8;
1749 s += 4 * src_stride;
1750 d += 4 * dst_stride;
1751 height -= 4;
1752 #else // !AOM_ARCH_AARCH64
1753 t0 = load_unaligned_u8_4x1(s);
1754 int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1755
1756 uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
1757 vget_low_s16(round_offset_vec));
1758
1759 vst1_u16(d, d0);
1760
1761 s0 = s1;
1762 s1 = s2;
1763 s2 = s3;
1764 s3 = s4;
1765 s4 = s5;
1766 s += src_stride;
1767 d += dst_stride;
1768 height--;
1769 #endif // AOM_ARCH_AARCH64
1770 } while (height != 0);
1771 src_ptr += 4;
1772 dst_ptr += 4;
1773 width -= 4;
1774 } while (width != 0);
1775 } else {
1776 do {
1777 const uint8_t *s = src_ptr + (5 * src_stride);
1778 CONV_BUF_TYPE *d = dst_ptr;
1779 int height = h;
1780
1781 uint8x8_t t0, t1, t2, t3, t4;
1782 load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
1783
1784 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1785 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1786 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1787 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1788 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1789
1790 do {
1791 #if AOM_ARCH_AARCH64
1792 uint8x8_t t5, t6, t7;
1793 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1794
1795 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
1796 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
1797 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
1798 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
1799 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
1800 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
1801 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
1802 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
1803
1804 uint16x8_t d0 =
1805 convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1806 uint16x8_t d1 =
1807 convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
1808 uint16x8_t d2 =
1809 convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
1810 uint16x8_t d3 =
1811 convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
1812 uint16x8_t d4 =
1813 convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
1814 uint16x8_t d5 =
1815 convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
1816 uint16x8_t d6 =
1817 convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
1818 uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
1819 round_offset_vec);
1820
1821 store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
1822
1823 s0 = s8;
1824 s1 = s9;
1825 s2 = s10;
1826 s3 = s11;
1827 s4 = s12;
1828 s += 8 * src_stride;
1829 d += 8 * dst_stride;
1830 height -= 8;
1831 #else // !AOM_ARCH_AARCH64
1832 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
1833
1834 uint16x8_t d0 =
1835 convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
1836
1837 s0 = s1;
1838 s1 = s2;
1839 s2 = s3;
1840 s3 = s4;
1841 s4 = s5;
1842
1843 vst1q_u16(d, d0);
1844
1845 s += src_stride;
1846 d += dst_stride;
1847 height--;
1848 #endif // AOM_ARCH_AARCH64
1849 } while (height != 0);
1850 src_ptr += 8;
1851 dst_ptr += 8;
1852 width -= 8;
1853 } while (width != 0);
1854 }
1855 }
1856
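// 8-tap vertical filter producing four output pixels. The caller pre-halves
// the filter taps, and vrsra_n_s16 both performs the rounding right shift by
// (ROUND0_BITS - 1) and adds the compound round_offset in a single
// instruction.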
1857 static inline uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
1858 const int16x4_t s2, const int16x4_t s3,
1859 const int16x4_t s4, const int16x4_t s5,
1860 const int16x4_t s6, const int16x4_t s7,
1861 const int16x8_t y_filter,
1862 const int16x4_t round_offset) {
1863 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1864 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1865
1866 int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0);
1867 sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
1868 sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
1869 sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
1870 sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
1871 sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1);
1872 sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2);
1873 sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
1874
1875 // We halved the convolution filter values so -1 from the right shift.
1876 int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
1877 return vreinterpret_u16_s16(res);
1878 }
1879
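// Eight-pixel variant of convolve8_4_y operating on full 128-bit vectors.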
1880 static inline uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
1881 const int16x8_t s2, const int16x8_t s3,
1882 const int16x8_t s4, const int16x8_t s5,
1883 const int16x8_t s6, const int16x8_t s7,
1884 const int16x8_t y_filter,
1885 const int16x8_t round_offset) {
1886 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1887 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1888
1889 int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
1890 sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
1891 sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
1892 sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
1893 sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
1894 sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1);
1895 sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2);
1896 sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
1897
1898 // We halved the convolution filter values so -1 from the right shift.
1899 int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
1900 return vreinterpretq_u16_s16(res);
1901 }
1902
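// 8-tap vertical pass with distance-weighted averaging: each new result is
// blended with the first prediction in conv_params->dst using the
// fwd_offset/bck_offset weights before being narrowed to 8 bits and stored
// through dst8_ptr.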
1903 static inline void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(
1904 const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
1905 const int dst8_stride, int w, int h, const int16x8_t y_filter,
1906 ConvolveParams *conv_params) {
1907 const int bd = 8;
1908 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
1909 const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
1910 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
1911 const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
1912
1913 const uint16_t fwd_offset = conv_params->fwd_offset;
1914 const uint16_t bck_offset = conv_params->bck_offset;
1915
1916 CONV_BUF_TYPE *dst_ptr = conv_params->dst;
1917 const int dst_stride = conv_params->dst_stride;
1918 int width = w;
1919
1920 if (w == 4 || h == 4) {
1921 do {
1922 const uint8_t *s = src_ptr;
1923 CONV_BUF_TYPE *d = dst_ptr;
1924 uint8_t *d_u8 = dst8_ptr;
1925 int height = h;
1926
1927 __builtin_prefetch(s + 0 * src_stride);
1928 __builtin_prefetch(s + 1 * src_stride);
1929 __builtin_prefetch(s + 2 * src_stride);
1930 __builtin_prefetch(s + 3 * src_stride);
1931
1932 uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1933 uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1934 uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1935 uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1936 uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
1937 uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
1938 uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
1939
1940 int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1941 int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1942 int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1943 int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1944 int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
1945 int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
1946 int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
1947
1948 __builtin_prefetch(d + 0 * dst_stride);
1949 __builtin_prefetch(d + 1 * dst_stride);
1950 __builtin_prefetch(d + 2 * dst_stride);
1951 __builtin_prefetch(d + 3 * dst_stride);
1952
1953 s += 7 * src_stride;
1954
1955 do {
1956 #if AOM_ARCH_AARCH64
1957 t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
1958 t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
1959 t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
1960 t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
1961
1962 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
1963 int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
1964 int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
1965 int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
1966
1967 uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
1968 vget_low_s16(round_offset_vec));
1969 uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
1970 vget_low_s16(round_offset_vec));
1971 uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
1972 vget_low_s16(round_offset_vec));
1973 uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
1974 vget_low_s16(round_offset_vec));
1975
1976 __builtin_prefetch(d + 0 * dst_stride);
1977 __builtin_prefetch(d + 1 * dst_stride);
1978 __builtin_prefetch(d + 2 * dst_stride);
1979 __builtin_prefetch(d + 3 * dst_stride);
1980
1981 __builtin_prefetch(d_u8 + 0 * dst8_stride);
1982 __builtin_prefetch(d_u8 + 1 * dst8_stride);
1983 __builtin_prefetch(d_u8 + 2 * dst8_stride);
1984 __builtin_prefetch(d_u8 + 3 * dst8_stride);
1985
1986 uint16x4_t dd0, dd1, dd2, dd3;
1987 load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
1988
1989 uint8x8_t d01, d23;
1990 compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
1991 bck_offset, round_offset_vec, &d01, &d23);
1992
1993 store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
1994 store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
1995
1996 s0 = s4;
1997 s1 = s5;
1998 s2 = s6;
1999 s3 = s7;
2000 s4 = s8;
2001 s5 = s9;
2002 s6 = s10;
2003 s += 4 * src_stride;
2004 d += 4 * dst_stride;
2005 d_u8 += 4 * dst8_stride;
2006 height -= 4;
2007 #else // !AOM_ARCH_AARCH64
2008 t0 = load_unaligned_u8_4x1(s);
2009 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2010
2011 uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2012 vget_low_s16(round_offset_vec));
2013
2014 __builtin_prefetch(d);
2015
2016 uint16x4_t dd0 = vld1_u16(d);
2017
2018 uint8x8_t d01;
2019 compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
2020 vget_low_s16(round_offset_vec), &d01);
2021
2022 store_u8_4x1(d_u8, d01);
2023
2024 s0 = s1;
2025 s1 = s2;
2026 s2 = s3;
2027 s3 = s4;
2028 s4 = s5;
2029 s5 = s6;
2030 s6 = s7;
2031 s += src_stride;
2032 d += dst_stride;
2033 d_u8 += dst8_stride;
2034 height--;
2035 #endif // AOM_ARCH_AARCH64
2036 } while (height != 0);
2037 src_ptr += 4;
2038 dst_ptr += 4;
2039 dst8_ptr += 4;
2040 width -= 4;
2041 } while (width != 0);
2042 } else {
2043 do {
2044 const uint8_t *s = src_ptr;
2045 CONV_BUF_TYPE *d = dst_ptr;
2046 uint8_t *d_u8 = dst8_ptr;
2047 int height = h;
2048
2049 __builtin_prefetch(s + 0 * src_stride);
2050 __builtin_prefetch(s + 1 * src_stride);
2051 __builtin_prefetch(s + 2 * src_stride);
2052 __builtin_prefetch(s + 3 * src_stride);
2053 __builtin_prefetch(s + 4 * src_stride);
2054 __builtin_prefetch(s + 5 * src_stride);
2055 __builtin_prefetch(s + 6 * src_stride);
2056 __builtin_prefetch(s + 7 * src_stride);
2057
2058 uint8x8_t t0, t1, t2, t3, t4, t5, t6;
2059 load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
2060
2061 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
2062 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
2063 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
2064 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
2065 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
2066 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
2067 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
2068
2069 s += 7 * src_stride;
2070
2071 do {
2072 #if AOM_ARCH_AARCH64
2073 uint8x8_t t7;
2074 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
2075
2076 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
2077 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
2078 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
2079 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
2080 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
2081 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
2082 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
2083 int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
2084
2085 __builtin_prefetch(dst_ptr + 0 * dst_stride);
2086 __builtin_prefetch(dst_ptr + 1 * dst_stride);
2087 __builtin_prefetch(dst_ptr + 2 * dst_stride);
2088 __builtin_prefetch(dst_ptr + 3 * dst_stride);
2089
2090 uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2091 round_offset_vec);
2092 uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
2093 round_offset_vec);
2094 uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
2095 round_offset_vec);
2096 uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
2097 round_offset_vec);
2098 uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
2099 y_filter, round_offset_vec);
2100 uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
2101 y_filter, round_offset_vec);
2102 uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
2103 y_filter, round_offset_vec);
2104 uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
2105 y_filter, round_offset_vec);
2106
2107 __builtin_prefetch(d + 0 * dst8_stride);
2108 __builtin_prefetch(d + 1 * dst8_stride);
2109 __builtin_prefetch(d + 2 * dst8_stride);
2110 __builtin_prefetch(d + 3 * dst8_stride);
2111
2112 uint16x8_t dd0, dd1, dd2, dd3;
2113 load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
2114
2115 uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
2116 compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
2117 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
2118 &d2_u8, &d3_u8);
2119
2120 store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
2121 d_u8 += 4 * dst8_stride;
2122
2123 uint16x8_t dd4, dd5, dd6, dd7;
2124 load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
2125
2126 uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
2127 compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
2128 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
2129 &d6_u8, &d7_u8);
2130
2131 store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
2132 d_u8 += 4 * dst8_stride;
2133
2134 s0 = s8;
2135 s1 = s9;
2136 s2 = s10;
2137 s3 = s11;
2138 s4 = s12;
2139 s5 = s13;
2140 s6 = s14;
2141 s += 8 * src_stride;
2142 d += 8 * dst_stride;
2143 height -= 8;
2144 #else // !AOM_ARCH_AARCH64
2145 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
2146
2147 __builtin_prefetch(dst_ptr);
2148
2149 uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2150 round_offset_vec);
2151
2152 s0 = s1;
2153 s1 = s2;
2154 s2 = s3;
2155 s3 = s4;
2156 s4 = s5;
2157 s5 = s6;
2158 s6 = s7;
2159
2160 __builtin_prefetch(d);
2161
2162 uint16x8_t dd0 = vld1q_u16(d);
2163
2164 uint8x8_t d0_u8;
2165 compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
2166 round_offset_vec, &d0_u8);
2167
2168 vst1_u8(d_u8, d0_u8);
2169 d_u8 += dst8_stride;
2170
2171 s += src_stride;
2172 d += dst_stride;
2173 height--;
2174 #endif // AOM_ARCH_AARCH64
2175 } while (height != 0);
2176 src_ptr += 8;
2177 dst_ptr += 8;
2178 dst8_ptr += 8;
2179 width -= 8;
2180 } while (width != 0);
2181 }
2182 }
2183
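// 8-tap vertical pass followed by an equal-weight average with the first
// prediction in conv_params->dst; the averaged result is written to dst8_ptr
// as 8-bit pixels.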
2184 static inline void dist_wtd_convolve_y_8tap_avg_neon(
2185 const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
2186 const int dst8_stride, int w, int h, const int16x8_t y_filter,
2187 ConvolveParams *conv_params) {
2188 const int bd = 8;
2189 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
2190 const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
2191 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
2192 const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
2193
2194 CONV_BUF_TYPE *dst_ptr = conv_params->dst;
2195 const int dst_stride = conv_params->dst_stride;
2196 int width = w;
2197
2198 if (w == 4 || h == 4) {
2199 do {
2200 const uint8_t *s = src_ptr;
2201 CONV_BUF_TYPE *d = dst_ptr;
2202 uint8_t *d_u8 = dst8_ptr;
2203 int height = h;
2204
2205 __builtin_prefetch(s + 0 * src_stride);
2206 __builtin_prefetch(s + 1 * src_stride);
2207 __builtin_prefetch(s + 2 * src_stride);
2208 __builtin_prefetch(s + 3 * src_stride);
2209
2210 uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
2211 uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
2212 uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
2213 uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
2214 uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
2215 uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
2216 uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
2217
2218 int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2219 int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
2220 int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
2221 int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
2222 int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
2223 int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
2224 int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
2225
2226 __builtin_prefetch(d + 0 * dst_stride);
2227 __builtin_prefetch(d + 1 * dst_stride);
2228 __builtin_prefetch(d + 2 * dst_stride);
2229 __builtin_prefetch(d + 3 * dst_stride);
2230
2231 s += 7 * src_stride;
2232
2233 do {
2234 #if AOM_ARCH_AARCH64
2235 t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
2236 t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
2237 t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
2238 t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
2239
2240 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2241 int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
2242 int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
2243 int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
2244
2245 uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2246 vget_low_s16(round_offset_vec));
2247 uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
2248 vget_low_s16(round_offset_vec));
2249 uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
2250 vget_low_s16(round_offset_vec));
2251 uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
2252 vget_low_s16(round_offset_vec));
2253
2254 __builtin_prefetch(d + 0 * dst_stride);
2255 __builtin_prefetch(d + 1 * dst_stride);
2256 __builtin_prefetch(d + 2 * dst_stride);
2257 __builtin_prefetch(d + 3 * dst_stride);
2258
2259 __builtin_prefetch(d_u8 + 0 * dst8_stride);
2260 __builtin_prefetch(d_u8 + 1 * dst8_stride);
2261 __builtin_prefetch(d_u8 + 2 * dst8_stride);
2262 __builtin_prefetch(d_u8 + 3 * dst8_stride);
2263
2264 uint16x4_t dd0, dd1, dd2, dd3;
2265 load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
2266
2267 uint8x8_t d01, d23;
2268 compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
2269 round_offset_vec, &d01, &d23);
2270
2271 store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
2272 store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
2273
2274 s0 = s4;
2275 s1 = s5;
2276 s2 = s6;
2277 s3 = s7;
2278 s4 = s8;
2279 s5 = s9;
2280 s6 = s10;
2281 s += 4 * src_stride;
2282 d += 4 * dst_stride;
2283 d_u8 += 4 * dst8_stride;
2284 height -= 4;
2285 #else // !AOM_ARCH_AARCH64
2286 t0 = load_unaligned_u8_4x1(s);
2287 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2288
2289 uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2290 vget_low_s16(round_offset_vec));
2291
2292 __builtin_prefetch(d);
2293
2294 uint16x4_t dd0 = vld1_u16(d);
2295
2296 uint8x8_t d01;
2297 compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
2298
2299 store_u8_4x1(d_u8, d01);
2300
2301 s0 = s1;
2302 s1 = s2;
2303 s2 = s3;
2304 s3 = s4;
2305 s4 = s5;
2306 s5 = s6;
2307 s6 = s7;
2308 s += src_stride;
2309 d += dst_stride;
2310 d_u8 += dst8_stride;
2311 height--;
2312 #endif // AOM_ARCH_AARCH64
2313 } while (height != 0);
2314 src_ptr += 4;
2315 dst_ptr += 4;
2316 dst8_ptr += 4;
2317 width -= 4;
2318 } while (width != 0);
2319 } else {
2320 do {
2321 const uint8_t *s = src_ptr;
2322 CONV_BUF_TYPE *d = dst_ptr;
2323 uint8_t *d_u8 = dst8_ptr;
2324 int height = h;
2325
2326 __builtin_prefetch(s + 0 * src_stride);
2327 __builtin_prefetch(s + 1 * src_stride);
2328 __builtin_prefetch(s + 2 * src_stride);
2329 __builtin_prefetch(s + 3 * src_stride);
2330 __builtin_prefetch(s + 4 * src_stride);
2331 __builtin_prefetch(s + 5 * src_stride);
2332 __builtin_prefetch(s + 6 * src_stride);
2333 __builtin_prefetch(s + 7 * src_stride);
2334
2335 uint8x8_t t0, t1, t2, t3, t4, t5, t6;
2336 load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
2337
2338 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
2339 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
2340 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
2341 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
2342 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
2343 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
2344 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
2345
2346 s += 7 * src_stride;
2347
2348 do {
2349 #if AOM_ARCH_AARCH64
2350 uint8x8_t t7;
2351 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
2352
2353 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
2354 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
2355 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
2356 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
2357 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
2358 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
2359 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
2360 int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
2361
2362 __builtin_prefetch(dst_ptr + 0 * dst_stride);
2363 __builtin_prefetch(dst_ptr + 1 * dst_stride);
2364 __builtin_prefetch(dst_ptr + 2 * dst_stride);
2365 __builtin_prefetch(dst_ptr + 3 * dst_stride);
2366
2367 uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2368 round_offset_vec);
2369 uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
2370 round_offset_vec);
2371 uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
2372 round_offset_vec);
2373 uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
2374 round_offset_vec);
2375 uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
2376 y_filter, round_offset_vec);
2377 uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
2378 y_filter, round_offset_vec);
2379 uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
2380 y_filter, round_offset_vec);
2381 uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
2382 y_filter, round_offset_vec);
2383
2384 __builtin_prefetch(d + 0 * dst8_stride);
2385 __builtin_prefetch(d + 1 * dst8_stride);
2386 __builtin_prefetch(d + 2 * dst8_stride);
2387 __builtin_prefetch(d + 3 * dst8_stride);
2388
2389 uint16x8_t dd0, dd1, dd2, dd3;
2390 load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
2391
2392 uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
2393 compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
2394 round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
2395
2396 store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
2397 d_u8 += 4 * dst8_stride;
2398
2399 uint16x8_t dd4, dd5, dd6, dd7;
2400 load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
2401
2402 uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
2403 compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
2404 round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
2405
2406 store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
2407 d_u8 += 4 * dst8_stride;
2408
2409 s0 = s8;
2410 s1 = s9;
2411 s2 = s10;
2412 s3 = s11;
2413 s4 = s12;
2414 s5 = s13;
2415 s6 = s14;
2416 s += 8 * src_stride;
2417 d += 8 * dst_stride;
2418 height -= 8;
2419 #else // !AOM_ARCH_AARCH64
2420 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
2421
2422 __builtin_prefetch(dst_ptr);
2423
2424 uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2425 round_offset_vec);
2426
2427 s0 = s1;
2428 s1 = s2;
2429 s2 = s3;
2430 s3 = s4;
2431 s4 = s5;
2432 s5 = s6;
2433 s6 = s7;
2434
2435 __builtin_prefetch(d);
2436
2437 uint16x8_t dd0 = vld1q_u16(d);
2438
2439 uint8x8_t d0_u8;
2440 compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
2441
2442 vst1_u8(d_u8, d0_u8);
2443 d_u8 += dst8_stride;
2444
2445 s += src_stride;
2446 d += dst_stride;
2447 height--;
2448 #endif // AOM_ARCH_AARCH64
2449 } while (height != 0);
2450 src_ptr += 8;
2451 dst_ptr += 8;
2452 dst8_ptr += 8;
2453 width -= 8;
2454 } while (width != 0);
2455 }
2456 }
2457
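// 8-tap vertical pass with no averaging: writes only the offset 16-bit
// convolution result to conv_params->dst.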
2458 static inline void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr,
2459 int src_stride, int w, int h,
2460 const int16x8_t y_filter,
2461 ConvolveParams *conv_params) {
2462 const int bd = 8;
2463 const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
2464 const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
2465 (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
2466 const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
2467
2468 CONV_BUF_TYPE *dst_ptr = conv_params->dst;
2469 const int dst_stride = conv_params->dst_stride;
2470 int width = w;
2471
2472 if (w == 4 || h == 4) {
2473 do {
2474 const uint8_t *s = src_ptr;
2475 CONV_BUF_TYPE *d = dst_ptr;
2476 int height = h;
2477
2478 __builtin_prefetch(s + 0 * src_stride);
2479 __builtin_prefetch(s + 1 * src_stride);
2480 __builtin_prefetch(s + 2 * src_stride);
2481 __builtin_prefetch(s + 3 * src_stride);
2482
2483 uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
2484 uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
2485 uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
2486 uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
2487 uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
2488 uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
2489 uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
2490
2491 int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2492 int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
2493 int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
2494 int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
2495 int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
2496 int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
2497 int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
2498
2499 __builtin_prefetch(d + 0 * dst_stride);
2500 __builtin_prefetch(d + 1 * dst_stride);
2501 __builtin_prefetch(d + 2 * dst_stride);
2502 __builtin_prefetch(d + 3 * dst_stride);
2503
2504 s += 7 * src_stride;
2505
2506 do {
2507 #if AOM_ARCH_AARCH64
2508 t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
2509 t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
2510 t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
2511 t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
2512
2513 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2514 int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
2515 int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
2516 int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
2517
2518 uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2519 vget_low_s16(round_offset_vec));
2520 uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
2521 vget_low_s16(round_offset_vec));
2522 uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
2523 vget_low_s16(round_offset_vec));
2524 uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
2525 vget_low_s16(round_offset_vec));
2526
2527 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
2528
2529 s0 = s4;
2530 s1 = s5;
2531 s2 = s6;
2532 s3 = s7;
2533 s4 = s8;
2534 s5 = s9;
2535 s6 = s10;
2536 s += 4 * src_stride;
2537 d += 4 * dst_stride;
2538 height -= 4;
2539 #else // !AOM_ARCH_AARCH64
2540 t0 = load_unaligned_u8_4x1(s);
2541 int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
2542
2543 uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2544 vget_low_s16(round_offset_vec));
2545
2546 vst1_u16(d, d0);
2547
2548 s0 = s1;
2549 s1 = s2;
2550 s2 = s3;
2551 s3 = s4;
2552 s4 = s5;
2553 s5 = s6;
2554 s6 = s7;
2555 s += src_stride;
2556 d += dst_stride;
2557 height--;
2558 #endif // AOM_ARCH_AARCH64
2559 } while (height != 0);
2560 src_ptr += 4;
2561 dst_ptr += 4;
2562 width -= 4;
2563 } while (width != 0);
2564 } else {
2565 do {
2566 const uint8_t *s = src_ptr;
2567 CONV_BUF_TYPE *d = dst_ptr;
2568 int height = h;
2569
2570 __builtin_prefetch(s + 0 * src_stride);
2571 __builtin_prefetch(s + 1 * src_stride);
2572 __builtin_prefetch(s + 2 * src_stride);
2573 __builtin_prefetch(s + 3 * src_stride);
2574 __builtin_prefetch(s + 4 * src_stride);
2575 __builtin_prefetch(s + 5 * src_stride);
2576 __builtin_prefetch(s + 6 * src_stride);
2577 __builtin_prefetch(s + 7 * src_stride);
2578
2579 uint8x8_t t0, t1, t2, t3, t4, t5, t6;
2580 load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
2581
2582 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
2583 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
2584 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
2585 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
2586 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
2587 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
2588 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
2589
2590 s += 7 * src_stride;
2591
2592 do {
2593 #if AOM_ARCH_AARCH64
2594 uint8x8_t t7;
2595 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
2596
2597 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
2598 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
2599 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
2600 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
2601 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
2602 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
2603 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
2604 int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
2605
2606 __builtin_prefetch(dst_ptr + 0 * dst_stride);
2607 __builtin_prefetch(dst_ptr + 1 * dst_stride);
2608 __builtin_prefetch(dst_ptr + 2 * dst_stride);
2609 __builtin_prefetch(dst_ptr + 3 * dst_stride);
2610
2611 uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2612 round_offset_vec);
2613 uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
2614 round_offset_vec);
2615 uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
2616 round_offset_vec);
2617 uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
2618 round_offset_vec);
2619 uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
2620 y_filter, round_offset_vec);
2621 uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
2622 y_filter, round_offset_vec);
2623 uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
2624 y_filter, round_offset_vec);
2625 uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
2626 y_filter, round_offset_vec);
2627
2628 store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
2629
2630 s0 = s8;
2631 s1 = s9;
2632 s2 = s10;
2633 s3 = s11;
2634 s4 = s12;
2635 s5 = s13;
2636 s6 = s14;
2637 s += 8 * src_stride;
2638 d += 8 * dst_stride;
2639 height -= 8;
2640 #else // !AOM_ARCH_AARCH64
2641 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
2642
2643 __builtin_prefetch(dst_ptr);
2644
2645 uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
2646 round_offset_vec);
2647
2648 s0 = s1;
2649 s1 = s2;
2650 s2 = s3;
2651 s3 = s4;
2652 s4 = s5;
2653 s5 = s6;
2654 s6 = s7;
2655
2656 vst1q_u16(d, d0);
2657
2658 s += src_stride;
2659 d += dst_stride;
2660 height--;
2661 #endif // AOM_ARCH_AARCH64
2662 } while (height != 0);
2663 src_ptr += 8;
2664 dst_ptr += 8;
2665 width -= 8;
2666 } while (width != 0);
2667 }
2668 }
2669
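// Top-level vertical pass of the distance-weighted compound convolution.
// Work is dispatched on the effective filter length (6 vs 8 taps) and on
// whether the result is averaged with an existing prediction, with or
// without distance weighting. The 6-tap paths start one row further down
// (src_ptr + src_stride) because the outermost tap of the padded 8-tap
// kernel is zero.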
2670 void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
2671 uint8_t *dst8, int dst8_stride, int w, int h,
2672 const InterpFilterParams *filter_params_y,
2673 const int subpel_y_qn,
2674 ConvolveParams *conv_params) {
2675 assert(w % 4 == 0);
2676 assert(h % 4 == 0);
2677
2678 // Vertical filter.
2679 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
2680 filter_params_y, subpel_y_qn & SUBPEL_MASK);
2681 // Filter values are even, so downshift by 1 to reduce intermediate
2682 // precision requirements.
2683 const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
2684
2685 const int vert_offset = filter_params_y->taps / 2 - 1;
2686 const uint8_t *src_ptr = src - (vert_offset * src_stride);
2687
2688 if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) {
2689 if (conv_params->do_average) {
2690 if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
2691 dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
2692 src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter,
2693 conv_params);
2694 } else {
2695 dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride,
2696 dst8, dst8_stride, w, h, y_filter,
2697 conv_params);
2698 }
2699 } else {
2700 dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h,
2701 y_filter, conv_params);
2702 }
2703 } else {
2704 if (conv_params->do_average) {
2705 if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
2706 dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8,
2707 dst8_stride, w, h, y_filter,
2708 conv_params);
2709 } else {
2710 dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8,
2711 dst8_stride, w, h, y_filter,
2712 conv_params);
2713 }
2714 } else {
2715 dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter,
2716 conv_params);
2717 }
2718 }
2719 }
2720